aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
commit1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/ipv4
downloadlinux-dev-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.xz
linux-dev-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.zip
Linux-2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig411
-rw-r--r--net/ipv4/Makefile33
-rw-r--r--net/ipv4/af_inet.c1188
-rw-r--r--net/ipv4/ah4.c335
-rw-r--r--net/ipv4/arp.c1425
-rw-r--r--net/ipv4/datagram.c73
-rw-r--r--net/ipv4/devinet.c1508
-rw-r--r--net/ipv4/esp4.c510
-rw-r--r--net/ipv4/fib_frontend.c611
-rw-r--r--net/ipv4/fib_hash.c1086
-rw-r--r--net/ipv4/fib_lookup.h43
-rw-r--r--net/ipv4/fib_rules.c437
-rw-r--r--net/ipv4/fib_semantics.c1332
-rw-r--r--net/ipv4/icmp.c1143
-rw-r--r--net/ipv4/igmp.c2473
-rw-r--r--net/ipv4/inetpeer.c460
-rw-r--r--net/ipv4/ip_forward.c127
-rw-r--r--net/ipv4/ip_fragment.c691
-rw-r--r--net/ipv4/ip_gre.c1290
-rw-r--r--net/ipv4/ip_input.c431
-rw-r--r--net/ipv4/ip_options.c625
-rw-r--r--net/ipv4/ip_output.c1359
-rw-r--r--net/ipv4/ip_sockglue.c1093
-rw-r--r--net/ipv4/ipcomp.c524
-rw-r--r--net/ipv4/ipconfig.c1507
-rw-r--r--net/ipv4/ipip.c905
-rw-r--r--net/ipv4/ipmr.c1900
-rw-r--r--net/ipv4/ipvs/Kconfig244
-rw-r--r--net/ipv4/ipvs/Makefile34
-rw-r--r--net/ipv4/ipvs/ip_vs_app.c658
-rw-r--r--net/ipv4/ipvs/ip_vs_conn.c920
-rw-r--r--net/ipv4/ipvs/ip_vs_core.c1191
-rw-r--r--net/ipv4/ipvs/ip_vs_ctl.c2391
-rw-r--r--net/ipv4/ipvs/ip_vs_dh.c258
-rw-r--r--net/ipv4/ipvs/ip_vs_est.c200
-rw-r--r--net/ipv4/ipvs/ip_vs_ftp.c400
-rw-r--r--net/ipv4/ipvs/ip_vs_lblc.c624
-rw-r--r--net/ipv4/ipvs/ip_vs_lblcr.c888
-rw-r--r--net/ipv4/ipvs/ip_vs_lc.c123
-rw-r--r--net/ipv4/ipvs/ip_vs_nq.c161
-rw-r--r--net/ipv4/ipvs/ip_vs_proto.c244
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_ah.c177
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_esp.c175
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_icmp.c182
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_tcp.c640
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_udp.c427
-rw-r--r--net/ipv4/ipvs/ip_vs_rr.c118
-rw-r--r--net/ipv4/ipvs/ip_vs_sched.c251
-rw-r--r--net/ipv4/ipvs/ip_vs_sed.c163
-rw-r--r--net/ipv4/ipvs/ip_vs_sh.c255
-rw-r--r--net/ipv4/ipvs/ip_vs_sync.c892
-rw-r--r--net/ipv4/ipvs/ip_vs_wlc.c151
-rw-r--r--net/ipv4/ipvs/ip_vs_wrr.c235
-rw-r--r--net/ipv4/ipvs/ip_vs_xmit.c563
-rw-r--r--net/ipv4/multipath.c55
-rw-r--r--net/ipv4/multipath_drr.c265
-rw-r--r--net/ipv4/multipath_random.c128
-rw-r--r--net/ipv4/multipath_rr.c115
-rw-r--r--net/ipv4/multipath_wrandom.c344
-rw-r--r--net/ipv4/netfilter/Kconfig696
-rw-r--r--net/ipv4/netfilter/Makefile89
-rw-r--r--net/ipv4/netfilter/arp_tables.c1333
-rw-r--r--net/ipv4/netfilter/arpt_mangle.c104
-rw-r--r--net/ipv4/netfilter/arptable_filter.c214
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c167
-rw-r--r--net/ipv4/netfilter/ip_conntrack_core.c1247
-rw-r--r--net/ipv4/netfilter/ip_conntrack_ftp.c501
-rw-r--r--net/ipv4/netfilter/ip_conntrack_irc.c313
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_generic.c75
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_icmp.c279
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_sctp.c649
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_tcp.c1098
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_udp.c146
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c961
-rw-r--r--net/ipv4/netfilter/ip_conntrack_tftp.c159
-rw-r--r--net/ipv4/netfilter/ip_nat_amanda.c88
-rw-r--r--net/ipv4/netfilter/ip_nat_core.c556
-rw-r--r--net/ipv4/netfilter/ip_nat_ftp.c183
-rw-r--r--net/ipv4/netfilter/ip_nat_helper.c430
-rw-r--r--net/ipv4/netfilter/ip_nat_irc.c125
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_icmp.c115
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_tcp.c178
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_udp.c165
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_unknown.c70
-rw-r--r--net/ipv4/netfilter/ip_nat_rule.c319
-rw-r--r--net/ipv4/netfilter/ip_nat_snmp_basic.c1347
-rw-r--r--net/ipv4/netfilter/ip_nat_standalone.c349
-rw-r--r--net/ipv4/netfilter/ip_nat_tftp.c70
-rw-r--r--net/ipv4/netfilter/ip_queue.c741
-rw-r--r--net/ipv4/netfilter/ip_tables.c1964
-rw-r--r--net/ipv4/netfilter/ipt_CLASSIFY.c92
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c761
-rw-r--r--net/ipv4/netfilter/ipt_CONNMARK.c118
-rw-r--r--net/ipv4/netfilter/ipt_DSCP.c106
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c175
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c485
-rw-r--r--net/ipv4/netfilter/ipt_MARK.c162
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c207
-rw-r--r--net/ipv4/netfilter/ipt_NETMAP.c117
-rw-r--r--net/ipv4/netfilter/ipt_NOTRACK.c76
-rw-r--r--net/ipv4/netfilter/ipt_REDIRECT.c129
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c335
-rw-r--r--net/ipv4/netfilter/ipt_SAME.c211
-rw-r--r--net/ipv4/netfilter/ipt_TCPMSS.c262
-rw-r--r--net/ipv4/netfilter/ipt_TOS.c105
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c419
-rw-r--r--net/ipv4/netfilter/ipt_addrtype.c77
-rw-r--r--net/ipv4/netfilter/ipt_ah.c117
-rw-r--r--net/ipv4/netfilter/ipt_comment.c59
-rw-r--r--net/ipv4/netfilter/ipt_connmark.c81
-rw-r--r--net/ipv4/netfilter/ipt_conntrack.c136
-rw-r--r--net/ipv4/netfilter/ipt_dscp.c63
-rw-r--r--net/ipv4/netfilter/ipt_ecn.c131
-rw-r--r--net/ipv4/netfilter/ipt_esp.c118
-rw-r--r--net/ipv4/netfilter/ipt_hashlimit.c731
-rw-r--r--net/ipv4/netfilter/ipt_helper.c113
-rw-r--r--net/ipv4/netfilter/ipt_iprange.c99
-rw-r--r--net/ipv4/netfilter/ipt_length.c64
-rw-r--r--net/ipv4/netfilter/ipt_limit.c157
-rw-r--r--net/ipv4/netfilter/ipt_mac.c79
-rw-r--r--net/ipv4/netfilter/ipt_mark.c64
-rw-r--r--net/ipv4/netfilter/ipt_multiport.c212
-rw-r--r--net/ipv4/netfilter/ipt_owner.c217
-rw-r--r--net/ipv4/netfilter/ipt_physdev.c134
-rw-r--r--net/ipv4/netfilter/ipt_pkttype.c70
-rw-r--r--net/ipv4/netfilter/ipt_realm.c76
-rw-r--r--net/ipv4/netfilter/ipt_recent.c1002
-rw-r--r--net/ipv4/netfilter/ipt_sctp.c203
-rw-r--r--net/ipv4/netfilter/ipt_state.c74
-rw-r--r--net/ipv4/netfilter/ipt_tcpmss.c127
-rw-r--r--net/ipv4/netfilter/ipt_tos.c64
-rw-r--r--net/ipv4/netfilter/ipt_ttl.c79
-rw-r--r--net/ipv4/netfilter/iptable_filter.c194
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c260
-rw-r--r--net/ipv4/netfilter/iptable_raw.c156
-rw-r--r--net/ipv4/proc.c382
-rw-r--r--net/ipv4/protocol.c101
-rw-r--r--net/ipv4/raw.c888
-rw-r--r--net/ipv4/route.c3177
-rw-r--r--net/ipv4/syncookies.c279
-rw-r--r--net/ipv4/sysctl_net_ipv4.c698
-rw-r--r--net/ipv4/tcp.c2386
-rw-r--r--net/ipv4/tcp_diag.c802
-rw-r--r--net/ipv4/tcp_input.c4959
-rw-r--r--net/ipv4/tcp_ipv4.c2663
-rw-r--r--net/ipv4/tcp_minisocks.c1077
-rw-r--r--net/ipv4/tcp_output.c1739
-rw-r--r--net/ipv4/tcp_timer.c656
-rw-r--r--net/ipv4/udp.c1575
-rw-r--r--net/ipv4/utils.c59
-rw-r--r--net/ipv4/xfrm4_input.c160
-rw-r--r--net/ipv4/xfrm4_output.c141
-rw-r--r--net/ipv4/xfrm4_policy.c281
-rw-r--r--net/ipv4/xfrm4_state.c126
-rw-r--r--net/ipv4/xfrm4_tunnel.c144
155 files changed, 82733 insertions, 0 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
new file mode 100644
index 000000000000..6d3e8b1bd1f2
--- /dev/null
+++ b/net/ipv4/Kconfig
@@ -0,0 +1,411 @@
+#
+# IP configuration
+#
+config IP_MULTICAST
+ bool "IP: multicasting"
+ depends on INET
+ help
+ This is code for addressing several networked computers at once,
+ enlarging your kernel by about 2 KB. You need multicasting if you
+ intend to participate in the MBONE, a high bandwidth network on top
+ of the Internet which carries audio and video broadcasts. More
+ information about the MBONE is on the WWW at
+ <http://www-itg.lbl.gov/mbone/>. Information about the multicast
+ capabilities of the various network cards is contained in
+ <file:Documentation/networking/multicast.txt>. For most people, it's
+ safe to say N.
+
+config IP_ADVANCED_ROUTER
+ bool "IP: advanced router"
+ depends on INET
+ ---help---
+ If you intend to run your Linux box mostly as a router, i.e. as a
+ computer that forwards and redistributes network packets, say Y; you
+ will then be presented with several options that allow more precise
+ control about the routing process.
+
+ The answer to this question won't directly affect the kernel:
+ answering N will just cause the configurator to skip all the
+ questions about advanced routing.
+
+ Note that your box can only act as a router if you enable IP
+ forwarding in your kernel; you can do that by saying Y to "/proc
+ file system support" and "Sysctl support" below and executing the
+ line
+
+ echo "1" > /proc/sys/net/ipv4/ip_forward
+
+ at boot time after the /proc file system has been mounted.
+
+ If you turn on IP forwarding, you will also get the rp_filter, which
+ automatically rejects incoming packets if the routing table entry
+ for their source address doesn't match the network interface they're
+ arriving on. This has security advantages because it prevents the
+ so-called IP spoofing, however it can pose problems if you use
+ asymmetric routing (packets from you to a host take a different path
+ than packets from that host to you) or if you operate a non-routing
+ host which has several IP addresses on different interfaces. To turn
+ rp_filter off use:
+
+ echo 0 > /proc/sys/net/ipv4/conf/<device>/rp_filter
+ or
+ echo 0 > /proc/sys/net/ipv4/conf/all/rp_filter
+
+ If unsure, say N here.
+
+config IP_MULTIPLE_TABLES
+ bool "IP: policy routing"
+ depends on IP_ADVANCED_ROUTER
+ ---help---
+ Normally, a router decides what to do with a received packet based
+ solely on the packet's final destination address. If you say Y here,
+ the Linux router will also be able to take the packet's source
+ address into account. Furthermore, the TOS (Type-Of-Service) field
+ of the packet can be used for routing decisions as well.
+
+ If you are interested in this, please see the preliminary
+ documentation at <http://www.compendium.com.ar/policy-routing.txt>
+ and <ftp://post.tepkom.ru/pub/vol2/Linux/docs/advanced-routing.tex>.
+ You will need supporting software from
+ <ftp://ftp.tux.org/pub/net/ip-routing/>.
+
+ If unsure, say N.
+
+config IP_ROUTE_FWMARK
+ bool "IP: use netfilter MARK value as routing key"
+ depends on IP_MULTIPLE_TABLES && NETFILTER
+ help
+ If you say Y here, you will be able to specify different routes for
+ packets with different mark values (see iptables(8), MARK target).
+
+config IP_ROUTE_MULTIPATH
+ bool "IP: equal cost multipath"
+ depends on IP_ADVANCED_ROUTER
+ help
+ Normally, the routing tables specify a single action to be taken in
+ a deterministic manner for a given packet. If you say Y here
+ however, it becomes possible to attach several actions to a packet
+ pattern, in effect specifying several alternative paths to travel
+ for those packets. The router considers all these paths to be of
+ equal "cost" and chooses one of them in a non-deterministic fashion
+ if a matching packet arrives.
+
+config IP_ROUTE_MULTIPATH_CACHED
+ bool "IP: equal cost multipath with caching support (EXPERIMENTAL)"
+ depends on: IP_ROUTE_MULTIPATH
+ help
+ Normally, equal cost multipath routing is not supported by the
+ routing cache. If you say Y here, alternative routes are cached
+ and on cache lookup a route is chosen in a configurable fashion.
+
+ If unsure, say N.
+
+config IP_ROUTE_MULTIPATH_RR
+ tristate "MULTIPATH: round robin algorithm"
+ depends on IP_ROUTE_MULTIPATH_CACHED
+ help
+ Mulitpath routes are chosen according to Round Robin
+
+config IP_ROUTE_MULTIPATH_RANDOM
+ tristate "MULTIPATH: random algorithm"
+ depends on IP_ROUTE_MULTIPATH_CACHED
+ help
+ Multipath routes are chosen in a random fashion. Actually,
+ there is no weight for a route. The advantage of this policy
+ is that it is implemented stateless and therefore introduces only
+ a very small delay.
+
+config IP_ROUTE_MULTIPATH_WRANDOM
+ tristate "MULTIPATH: weighted random algorithm"
+ depends on IP_ROUTE_MULTIPATH_CACHED
+ help
+ Multipath routes are chosen in a weighted random fashion.
+ The per route weights are the weights visible via ip route 2. As the
+ corresponding state management introduces some overhead routing delay
+ is increased.
+
+config IP_ROUTE_MULTIPATH_DRR
+ tristate "MULTIPATH: interface round robin algorithm"
+ depends on IP_ROUTE_MULTIPATH_CACHED
+ help
+ Connections are distributed in a round robin fashion over the
+ available interfaces. This policy makes sense if the connections
+ should be primarily distributed on interfaces and not on routes.
+
+config IP_ROUTE_VERBOSE
+ bool "IP: verbose route monitoring"
+ depends on IP_ADVANCED_ROUTER
+ help
+ If you say Y here, which is recommended, then the kernel will print
+ verbose messages regarding the routing, for example warnings about
+ received packets which look strange and could be evidence of an
+ attack or a misconfigured system somewhere. The information is
+ handled by the klogd daemon which is responsible for kernel messages
+ ("man klogd").
+
+config IP_PNP
+ bool "IP: kernel level autoconfiguration"
+ depends on INET
+ help
+ This enables automatic configuration of IP addresses of devices and
+ of the routing table during kernel boot, based on either information
+ supplied on the kernel command line or by BOOTP or RARP protocols.
+ You need to say Y only for diskless machines requiring network
+ access to boot (in which case you want to say Y to "Root file system
+ on NFS" as well), because all other machines configure the network
+ in their startup scripts.
+
+config IP_PNP_DHCP
+ bool "IP: DHCP support"
+ depends on IP_PNP
+ ---help---
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS and you want the IP address of your computer to be
+ discovered automatically at boot time using the DHCP protocol (a
+ special protocol designed for doing this job), say Y here. In case
+ the boot ROM of your network card was designed for booting Linux and
+ does DHCP itself, providing all necessary information on the kernel
+ command line, you can say N here.
+
+ If unsure, say Y. Note that if you want to use DHCP, a DHCP server
+ must be operating on your network. Read
+ <file:Documentation/nfsroot.txt> for details.
+
+config IP_PNP_BOOTP
+ bool "IP: BOOTP support"
+ depends on IP_PNP
+ ---help---
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS and you want the IP address of your computer to be
+ discovered automatically at boot time using the BOOTP protocol (a
+ special protocol designed for doing this job), say Y here. In case
+ the boot ROM of your network card was designed for booting Linux and
+ does BOOTP itself, providing all necessary information on the kernel
+ command line, you can say N here. If unsure, say Y. Note that if you
+ want to use BOOTP, a BOOTP server must be operating on your network.
+ Read <file:Documentation/nfsroot.txt> for details.
+
+config IP_PNP_RARP
+ bool "IP: RARP support"
+ depends on IP_PNP
+ help
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS and you want the IP address of your computer to be
+ discovered automatically at boot time using the RARP protocol (an
+ older protocol which is being obsoleted by BOOTP and DHCP), say Y
+ here. Note that if you want to use RARP, a RARP server must be
+ operating on your network. Read <file:Documentation/nfsroot.txt> for
+ details.
+
+# not yet ready..
+# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
+config NET_IPIP
+ tristate "IP: tunneling"
+ depends on INET
+ select INET_TUNNEL
+ ---help---
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This particular tunneling driver implements
+ encapsulation of IP within IP, which sounds kind of pointless, but
+ can be useful if you want to make your (or some other) machine
+ appear on a different network than it physically is, or to use
+ mobile-IP facilities (allowing laptops to seamlessly move between
+ networks without changing their IP addresses).
+
+ Saying Y to this option will produce two modules ( = code which can
+ be inserted in and removed from the running kernel whenever you
+ want). Most people won't need this and can say N.
+
+config NET_IPGRE
+ tristate "IP: GRE tunnels over IP"
+ depends on INET
+ select XFRM
+ help
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This particular tunneling driver implements
+ GRE (Generic Routing Encapsulation) and at this time allows
+ encapsulating of IPv4 or IPv6 over existing IPv4 infrastructure.
+ This driver is useful if the other endpoint is a Cisco router: Cisco
+ likes GRE much better than the other Linux tunneling driver ("IP
+ tunneling" above). In addition, GRE allows multicast redistribution
+ through the tunnel.
+
+config NET_IPGRE_BROADCAST
+ bool "IP: broadcast GRE over IP"
+ depends on IP_MULTICAST && NET_IPGRE
+ help
+ One application of GRE/IP is to construct a broadcast WAN (Wide Area
+ Network), which looks like a normal Ethernet LAN (Local Area
+ Network), but can be distributed all over the Internet. If you want
+ to do that, say Y here and to "IP multicast routing" below.
+
+config IP_MROUTE
+ bool "IP: multicast routing"
+ depends on IP_MULTICAST
+ help
+ This is used if you want your machine to act as a router for IP
+ packets that have several destination addresses. It is needed on the
+ MBONE, a high bandwidth network on top of the Internet which carries
+ audio and video broadcasts. In order to do that, you would most
+ likely run the program mrouted. Information about the multicast
+ capabilities of the various network cards is contained in
+ <file:Documentation/networking/multicast.txt>. If you haven't heard
+ about it, you don't need it.
+
+config IP_PIMSM_V1
+ bool "IP: PIM-SM version 1 support"
+ depends on IP_MROUTE
+ help
+ Kernel side support for Sparse Mode PIM (Protocol Independent
+ Multicast) version 1. This multicast routing protocol is used widely
+ because Cisco supports it. You need special software to use it
+ (pimd-v1). Please see <http://netweb.usc.edu/pim/> for more
+ information about PIM.
+
+ Say Y if you want to use PIM-SM v1. Note that you can say N here if
+ you just want to use Dense Mode PIM.
+
+config IP_PIMSM_V2
+ bool "IP: PIM-SM version 2 support"
+ depends on IP_MROUTE
+ help
+ Kernel side support for Sparse Mode PIM version 2. In order to use
+ this, you need an experimental routing daemon supporting it (pimd or
+ gated-5). This routing protocol is not used widely, so say N unless
+ you want to play with it.
+
+config ARPD
+ bool "IP: ARP daemon support (EXPERIMENTAL)"
+ depends on INET && EXPERIMENTAL
+ ---help---
+ Normally, the kernel maintains an internal cache which maps IP
+ addresses to hardware addresses on the local network, so that
+ Ethernet/Token Ring/ etc. frames are sent to the proper address on
+ the physical networking layer. For small networks having a few
+ hundred directly connected hosts or less, keeping this address
+ resolution (ARP) cache inside the kernel works well. However,
+ maintaining an internal ARP cache does not work well for very large
+ switched networks, and will use a lot of kernel memory if TCP/IP
+ connections are made to many machines on the network.
+
+ If you say Y here, the kernel's internal ARP cache will never grow
+ to more than 256 entries (the oldest entries are expired in a LIFO
+ manner) and communication will be attempted with the user space ARP
+ daemon arpd. Arpd then answers the address resolution request either
+ from its own cache or by asking the net.
+
+ This code is experimental and also obsolete. If you want to use it,
+ you need to find a version of the daemon arpd on the net somewhere,
+ and you should also say Y to "Kernel/User network link driver",
+ below. If unsure, say N.
+
+config SYN_COOKIES
+ bool "IP: TCP syncookie support (disabled per default)"
+ depends on INET
+ ---help---
+ Normal TCP/IP networking is open to an attack known as "SYN
+ flooding". This denial-of-service attack prevents legitimate remote
+ users from being able to connect to your computer during an ongoing
+ attack and requires very little work from the attacker, who can
+ operate from anywhere on the Internet.
+
+ SYN cookies provide protection against this type of attack. If you
+ say Y here, the TCP/IP stack will use a cryptographic challenge
+ protocol known as "SYN cookies" to enable legitimate users to
+ continue to connect, even when your machine is under attack. There
+ is no need for the legitimate users to change their TCP/IP software;
+ SYN cookies work transparently to them. For technical information
+ about SYN cookies, check out <http://cr.yp.to/syncookies.html>.
+
+ If you are SYN flooded, the source address reported by the kernel is
+ likely to have been forged by the attacker; it is only reported as
+ an aid in tracing the packets to their actual source and should not
+ be taken as absolute truth.
+
+ SYN cookies may prevent correct error reporting on clients when the
+ server is really overloaded. If this happens frequently better turn
+ them off.
+
+ If you say Y here, note that SYN cookies aren't enabled by default;
+ you can enable them by saying Y to "/proc file system support" and
+ "Sysctl support" below and executing the command
+
+ echo 1 >/proc/sys/net/ipv4/tcp_syncookies
+
+ at boot time after the /proc file system has been mounted.
+
+ If unsure, say N.
+
+config INET_AH
+ tristate "IP: AH transformation"
+ depends on INET
+ select XFRM
+ select CRYPTO
+ select CRYPTO_HMAC
+ select CRYPTO_MD5
+ select CRYPTO_SHA1
+ ---help---
+ Support for IPsec AH.
+
+ If unsure, say Y.
+
+config INET_ESP
+ tristate "IP: ESP transformation"
+ depends on INET
+ select XFRM
+ select CRYPTO
+ select CRYPTO_HMAC
+ select CRYPTO_MD5
+ select CRYPTO_SHA1
+ select CRYPTO_DES
+ ---help---
+ Support for IPsec ESP.
+
+ If unsure, say Y.
+
+config INET_IPCOMP
+ tristate "IP: IPComp transformation"
+ depends on INET
+ select XFRM
+ select INET_TUNNEL
+ select CRYPTO
+ select CRYPTO_DEFLATE
+ ---help---
+ Support for IP Payload Compression Protocol (IPComp) (RFC3173),
+ typically needed for IPsec.
+
+ If unsure, say Y.
+
+config INET_TUNNEL
+ tristate "IP: tunnel transformation"
+ depends on INET
+ select XFRM
+ ---help---
+ Support for generic IP tunnel transformation, which is required by
+ the IP tunneling module as well as tunnel mode IPComp.
+
+ If unsure, say Y.
+
+config IP_TCPDIAG
+ tristate "IP: TCP socket monitoring interface"
+ depends on INET
+ default y
+ ---help---
+ Support for TCP socket monitoring interface used by native Linux
+ tools such as ss. ss is included in iproute2, currently downloadable
+ at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support
+ and have selected IPv6 as a module, you need to build this as a
+ module too.
+
+ If unsure, say Y.
+
+config IP_TCPDIAG_IPV6
+ def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
+
+source "net/ipv4/ipvs/Kconfig"
+
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
new file mode 100644
index 000000000000..8b379627ebb6
--- /dev/null
+++ b/net/ipv4/Makefile
@@ -0,0 +1,33 @@
+#
+# Makefile for the Linux TCP/IP (INET) layer.
+#
+
+obj-y := utils.o route.o inetpeer.o protocol.o \
+ ip_input.o ip_fragment.o ip_forward.o ip_options.o \
+ ip_output.o ip_sockglue.o \
+ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
+ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
+ sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o
+
+obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
+obj-$(CONFIG_IP_MROUTE) += ipmr.o
+obj-$(CONFIG_NET_IPIP) += ipip.o
+obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+obj-$(CONFIG_SYN_COOKIES) += syncookies.o
+obj-$(CONFIG_INET_AH) += ah4.o
+obj-$(CONFIG_INET_ESP) += esp4.o
+obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
+obj-$(CONFIG_INET_TUNNEL) += xfrm4_tunnel.o
+obj-$(CONFIG_IP_PNP) += ipconfig.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
+obj-$(CONFIG_NETFILTER) += netfilter/
+obj-$(CONFIG_IP_VS) += ipvs/
+obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
+
+obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
+ xfrm4_output.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
new file mode 100644
index 000000000000..c34dab67e461
--- /dev/null
+++ b/net/ipv4/af_inet.c
@@ -0,0 +1,1188 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * PF_INET protocol family socket handler.
+ *
+ * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Alan Cox, <A.Cox@swansea.ac.uk>
+ *
+ * Changes (see also sock.c)
+ *
+ * piggy,
+ * Karl Knutson : Socket protocol table
+ * A.N.Kuznetsov : Socket death error in accept().
+ * John Richardson : Fix non blocking error in connect()
+ * so sockets that fail to connect
+ * don't return -EINPROGRESS.
+ * Alan Cox : Asynchronous I/O support
+ * Alan Cox : Keep correct socket pointer on sock
+ * structures
+ * when accept() ed
+ * Alan Cox : Semantics of SO_LINGER aren't state
+ * moved to close when you look carefully.
+ * With this fixed and the accept bug fixed
+ * some RPC stuff seems happier.
+ * Niibe Yutaka : 4.4BSD style write async I/O
+ * Alan Cox,
+ * Tony Gale : Fixed reuse semantics.
+ * Alan Cox : bind() shouldn't abort existing but dead
+ * sockets. Stops FTP netin:.. I hope.
+ * Alan Cox : bind() works correctly for RAW sockets.
+ * Note that FreeBSD at least was broken
+ * in this respect so be careful with
+ * compatibility tests...
+ * Alan Cox : routing cache support
+ * Alan Cox : memzero the socket structure for
+ * compactness.
+ * Matt Day : nonblock connect error handler
+ * Alan Cox : Allow large numbers of pending sockets
+ * (eg for big web sites), but only if
+ * specifically application requested.
+ * Alan Cox : New buffering throughout IP. Used
+ * dumbly.
+ * Alan Cox : New buffering now used smartly.
+ * Alan Cox : BSD rather than common sense
+ * interpretation of listen.
+ * Germano Caronni : Assorted small races.
+ * Alan Cox : sendmsg/recvmsg basic support.
+ * Alan Cox : Only sendmsg/recvmsg now supported.
+ * Alan Cox : Locked down bind (see security list).
+ * Alan Cox : Loosened bind a little.
+ * Mike McLagan : ADD/DEL DLCI Ioctls
+ * Willy Konynenberg : Transparent proxying support.
+ * David S. Miller : New socket lookup architecture.
+ * Some other random speedups.
+ * Cyrus Durgin : Cleaned up file for kmod hacks.
+ * Andi Kleen : Fix inet_stream_connect TCP race.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/igmp.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/raw.h>
+#include <net/icmp.h>
+#include <net/ipip.h>
+#include <net/inet_common.h>
+#include <net/xfrm.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+
+DEFINE_SNMP_STAT(struct linux_mib, net_statistics);
+
+#ifdef INET_REFCNT_DEBUG
+atomic_t inet_sock_nr;
+#endif
+
+extern void ip_mc_drop_socket(struct sock *sk);
+
+/* The inetsw table contains everything that inet_create needs to
+ * build a new socket.
+ */
+static struct list_head inetsw[SOCK_MAX];
+static DEFINE_SPINLOCK(inetsw_lock);
+
+/* New destruction routine */
+
+void inet_sock_destruct(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ __skb_queue_purge(&sk->sk_receive_queue);
+ __skb_queue_purge(&sk->sk_error_queue);
+
+ if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
+ printk("Attempt to release TCP socket in state %d %p\n",
+ sk->sk_state, sk);
+ return;
+ }
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ printk("Attempt to release alive inet socket %p\n", sk);
+ return;
+ }
+
+ BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
+ BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
+ BUG_TRAP(!sk->sk_wmem_queued);
+ BUG_TRAP(!sk->sk_forward_alloc);
+
+ if (inet->opt)
+ kfree(inet->opt);
+ dst_release(sk->sk_dst_cache);
+#ifdef INET_REFCNT_DEBUG
+ atomic_dec(&inet_sock_nr);
+ printk(KERN_DEBUG "INET socket %p released, %d are still alive\n",
+ sk, atomic_read(&inet_sock_nr));
+#endif
+}
+
+/*
+ * The routines beyond this point handle the behaviour of an AF_INET
+ * socket object. Mostly it punts to the subprotocols of IP to do
+ * the work.
+ */
+
+/*
+ * Automatically bind an unbound socket.
+ */
+
+static int inet_autobind(struct sock *sk)
+{
+ struct inet_sock *inet;
+ /* We may need to bind the socket. */
+ lock_sock(sk);
+ inet = inet_sk(sk);
+ if (!inet->num) {
+ if (sk->sk_prot->get_port(sk, 0)) {
+ release_sock(sk);
+ return -EAGAIN;
+ }
+ inet->sport = htons(inet->num);
+ }
+ release_sock(sk);
+ return 0;
+}
+
+/*
+ * Move a socket into listening state.
+ */
+int inet_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ unsigned char old_state;
+ int err;
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
+ goto out;
+
+ old_state = sk->sk_state;
+ if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ goto out;
+
+ /* Really, if the socket is already in listen state
+ * we can only allow the backlog to be adjusted.
+ */
+ if (old_state != TCP_LISTEN) {
+ err = tcp_listen_start(sk);
+ if (err)
+ goto out;
+ }
+ sk->sk_max_ack_backlog = backlog;
+ err = 0;
+
+out:
+ release_sock(sk);
+ return err;
+}
+
+/*
+ * Create an inet socket.
+ */
+
+static int inet_create(struct socket *sock, int protocol)
+{
+ struct sock *sk;
+ struct list_head *p;
+ struct inet_protosw *answer;
+ struct inet_sock *inet;
+ struct proto *answer_prot;
+ unsigned char answer_flags;
+ char answer_no_check;
+ int err;
+
+ sock->state = SS_UNCONNECTED;
+
+ /* Look for the requested type/protocol pair. */
+ answer = NULL;
+ rcu_read_lock();
+ list_for_each_rcu(p, &inetsw[sock->type]) {
+ answer = list_entry(p, struct inet_protosw, list);
+
+ /* Check the non-wild match. */
+ if (protocol == answer->protocol) {
+ if (protocol != IPPROTO_IP)
+ break;
+ } else {
+ /* Check for the two wild cases. */
+ if (IPPROTO_IP == protocol) {
+ protocol = answer->protocol;
+ break;
+ }
+ if (IPPROTO_IP == answer->protocol)
+ break;
+ }
+ answer = NULL;
+ }
+
+ err = -ESOCKTNOSUPPORT;
+ if (!answer)
+ goto out_rcu_unlock;
+ err = -EPERM;
+ if (answer->capability > 0 && !capable(answer->capability))
+ goto out_rcu_unlock;
+ err = -EPROTONOSUPPORT;
+ if (!protocol)
+ goto out_rcu_unlock;
+
+ sock->ops = answer->ops;
+ answer_prot = answer->prot;
+ answer_no_check = answer->no_check;
+ answer_flags = answer->flags;
+ rcu_read_unlock();
+
+ BUG_TRAP(answer_prot->slab != NULL);
+
+ err = -ENOBUFS;
+ sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);
+ if (sk == NULL)
+ goto out;
+
+ err = 0;
+ sk->sk_no_check = answer_no_check;
+ if (INET_PROTOSW_REUSE & answer_flags)
+ sk->sk_reuse = 1;
+
+ inet = inet_sk(sk);
+
+ if (SOCK_RAW == sock->type) {
+ inet->num = protocol;
+ if (IPPROTO_RAW == protocol)
+ inet->hdrincl = 1;
+ }
+
+ if (ipv4_config.no_pmtu_disc)
+ inet->pmtudisc = IP_PMTUDISC_DONT;
+ else
+ inet->pmtudisc = IP_PMTUDISC_WANT;
+
+ inet->id = 0;
+
+ sock_init_data(sock, sk);
+
+ sk->sk_destruct = inet_sock_destruct;
+ sk->sk_family = PF_INET;
+ sk->sk_protocol = protocol;
+ sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+
+ inet->uc_ttl = -1;
+ inet->mc_loop = 1;
+ inet->mc_ttl = 1;
+ inet->mc_index = 0;
+ inet->mc_list = NULL;
+
+#ifdef INET_REFCNT_DEBUG
+ atomic_inc(&inet_sock_nr);
+#endif
+
+ if (inet->num) {
+ /* It assumes that any protocol which allows
+ * the user to assign a number at socket
+ * creation time automatically
+ * shares.
+ */
+ inet->sport = htons(inet->num);
+ /* Add to protocol hash chains. */
+ sk->sk_prot->hash(sk);
+ }
+
+ if (sk->sk_prot->init) {
+ err = sk->sk_prot->init(sk);
+ if (err)
+ sk_common_release(sk);
+ }
+out:
+ return err;
+out_rcu_unlock:
+ rcu_read_unlock();
+ goto out;
+}
+
+
+/*
+ * The peer socket should always be NULL (or else). When we call this
+ * function we are destroying the object and from then on nobody
+ * should refer to it.
+ */
+int inet_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk) {
+ long timeout;
+
+ /* Applications forget to leave groups before exiting */
+ ip_mc_drop_socket(sk);
+
+ /* If linger is set, we don't return until the close
+ * is complete. Otherwise we return immediately. The
+ * actually closing is done the same either way.
+ *
+ * If the close is due to the process exiting, we never
+ * linger..
+ */
+ timeout = 0;
+ if (sock_flag(sk, SOCK_LINGER) &&
+ !(current->flags & PF_EXITING))
+ timeout = sk->sk_lingertime;
+ sock->sk = NULL;
+ sk->sk_prot->close(sk, timeout);
+ }
+ return 0;
+}
+
+/* It is off by default, see below. */
+int sysctl_ip_nonlocal_bind;
+
+int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+ struct sock *sk = sock->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ unsigned short snum;
+ int chk_addr_ret;
+ int err;
+
+ /* If the socket has its own bind function then use it. (RAW) */
+ if (sk->sk_prot->bind) {
+ err = sk->sk_prot->bind(sk, uaddr, addr_len);
+ goto out;
+ }
+ err = -EINVAL;
+ if (addr_len < sizeof(struct sockaddr_in))
+ goto out;
+
+ chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
+
+ /* Not specified by any standard per-se, however it breaks too
+ * many applications when removed. It is unfortunate since
+ * allowing applications to make a non-local bind solves
+ * several problems with systems using dynamic addressing.
+ * (ie. your servers still start up even if your ISDN link
+ * is temporarily down)
+ */
+ err = -EADDRNOTAVAIL;
+ if (!sysctl_ip_nonlocal_bind &&
+ !inet->freebind &&
+ addr->sin_addr.s_addr != INADDR_ANY &&
+ chk_addr_ret != RTN_LOCAL &&
+ chk_addr_ret != RTN_MULTICAST &&
+ chk_addr_ret != RTN_BROADCAST)
+ goto out;
+
+ snum = ntohs(addr->sin_port);
+ err = -EACCES;
+ if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ goto out;
+
+ /* We keep a pair of addresses. rcv_saddr is the one
+ * used by hash lookups, and saddr is used for transmit.
+ *
+ * In the BSD API these are the same except where it
+ * would be illegal to use them (multicast/broadcast) in
+ * which case the sending device address is used.
+ */
+ lock_sock(sk);
+
+ /* Check these errors (active socket, double bind). */
+ err = -EINVAL;
+ if (sk->sk_state != TCP_CLOSE || inet->num)
+ goto out_release_sock;
+
+ inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+ inet->saddr = 0; /* Use device */
+
+ /* Make sure we are allowed to bind here. */
+ if (sk->sk_prot->get_port(sk, snum)) {
+ inet->saddr = inet->rcv_saddr = 0;
+ err = -EADDRINUSE;
+ goto out_release_sock;
+ }
+
+ if (inet->rcv_saddr)
+ sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+ if (snum)
+ sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+ inet->sport = htons(inet->num);
+ inet->daddr = 0;
+ inet->dport = 0;
+ sk_dst_reset(sk);
+ err = 0;
+out_release_sock:
+ release_sock(sk);
+out:
+ return err;
+}
+
+int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ if (uaddr->sa_family == AF_UNSPEC)
+ return sk->sk_prot->disconnect(sk, flags);
+
+ if (!inet_sk(sk)->num && inet_autobind(sk))
+ return -EAGAIN;
+ return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+}
+
+static long inet_wait_for_connect(struct sock *sk, long timeo)
+{
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+
+ /* Basic assumption: if someone sets sk->sk_err, he _must_
+ * change state of the socket from TCP_SYN_*.
+ * Connect() does not allow to get error notifications
+ * without closing the socket.
+ */
+ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ if (signal_pending(current) || !timeo)
+ break;
+ prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ }
+ finish_wait(sk->sk_sleep, &wait);
+ return timeo;
+}
+
+/*
+ * Connect to a remote host. There is regrettably still a little
+ * TCP 'magic' in here.
+ */
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ int err;
+ long timeo;
+
+ lock_sock(sk);
+
+ if (uaddr->sa_family == AF_UNSPEC) {
+ err = sk->sk_prot->disconnect(sk, flags);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ goto out;
+ }
+
+ switch (sock->state) {
+ default:
+ err = -EINVAL;
+ goto out;
+ case SS_CONNECTED:
+ err = -EISCONN;
+ goto out;
+ case SS_CONNECTING:
+ err = -EALREADY;
+ /* Fall out of switch with err, set for this state */
+ break;
+ case SS_UNCONNECTED:
+ err = -EISCONN;
+ if (sk->sk_state != TCP_CLOSE)
+ goto out;
+
+ err = sk->sk_prot->connect(sk, uaddr, addr_len);
+ if (err < 0)
+ goto out;
+
+ sock->state = SS_CONNECTING;
+
+ /* Just entered SS_CONNECTING state; the only
+ * difference is that return value in non-blocking
+ * case is EINPROGRESS, rather than EALREADY.
+ */
+ err = -EINPROGRESS;
+ break;
+ }
+
+ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ /* Error code is set above */
+ if (!timeo || !inet_wait_for_connect(sk, timeo))
+ goto out;
+
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ goto out;
+ }
+
+ /* Connection was closed by RST, timeout, ICMP error
+ * or another process disconnected us.
+ */
+ if (sk->sk_state == TCP_CLOSE)
+ goto sock_error;
+
+ /* sk->sk_err may be not zero now, if RECVERR was ordered by user
+ * and error was received after socket entered established state.
+ * Hence, it is handled normally after connect() return successfully.
+ */
+
+ sock->state = SS_CONNECTED;
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
+
+sock_error:
+ err = sock_error(sk) ? : -ECONNABORTED;
+ sock->state = SS_UNCONNECTED;
+ if (sk->sk_prot->disconnect(sk, flags))
+ sock->state = SS_DISCONNECTING;
+ goto out;
+}
+
+/*
+ * Accept a pending connection. The TCP layer now gives BSD semantics.
+ */
+
+int inet_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sock *sk1 = sock->sk;
+ int err = -EINVAL;
+ struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
+
+ if (!sk2)
+ goto do_err;
+
+ lock_sock(sk2);
+
+ BUG_TRAP((1 << sk2->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE));
+
+ sock_graft(sk2, newsock);
+
+ newsock->state = SS_CONNECTED;
+ err = 0;
+ release_sock(sk2);
+do_err:
+ return err;
+}
+
+
+/*
+ * This does both peername and sockname.
+ */
+int inet_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sock *sk = sock->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+
+ sin->sin_family = AF_INET;
+ if (peer) {
+ if (!inet->dport ||
+ (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
+ peer == 1))
+ return -ENOTCONN;
+ sin->sin_port = inet->dport;
+ sin->sin_addr.s_addr = inet->daddr;
+ } else {
+ __u32 addr = inet->rcv_saddr;
+ if (!addr)
+ addr = inet->saddr;
+ sin->sin_port = inet->sport;
+ sin->sin_addr.s_addr = addr;
+ }
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *uaddr_len = sizeof(*sin);
+ return 0;
+}
+
+int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size)
+{
+ struct sock *sk = sock->sk;
+
+ /* We may need to bind the socket. */
+ if (!inet_sk(sk)->num && inet_autobind(sk))
+ return -EAGAIN;
+
+ return sk->sk_prot->sendmsg(iocb, sk, msg, size);
+}
+
+
+static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ /* We may need to bind the socket. */
+ if (!inet_sk(sk)->num && inet_autobind(sk))
+ return -EAGAIN;
+
+ if (sk->sk_prot->sendpage)
+ return sk->sk_prot->sendpage(sk, page, offset, size, flags);
+ return sock_no_sendpage(sock, page, offset, size, flags);
+}
+
+
+int inet_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ /* This should really check to make sure
+ * the socket is a TCP socket. (WHY AC...)
+ */
+ how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
+ 1->2 bit 2 snds.
+ 2->3 */
+ if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */
+ return -EINVAL;
+
+ lock_sock(sk);
+ if (sock->state == SS_CONNECTING) {
+ if ((1 << sk->sk_state) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
+ sock->state = SS_DISCONNECTING;
+ else
+ sock->state = SS_CONNECTED;
+ }
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ err = -ENOTCONN;
+ /* Hack to wake up other listeners, who can poll for
+ POLLHUP, even on eg. unconnected UDP sockets -- RR */
+ default:
+ sk->sk_shutdown |= how;
+ if (sk->sk_prot->shutdown)
+ sk->sk_prot->shutdown(sk, how);
+ break;
+
+ /* Remaining two branches are temporary solution for missing
+ * close() in multithreaded environment. It is _not_ a good idea,
+ * but we have no choice until close() is repaired at VFS level.
+ */
+ case TCP_LISTEN:
+ if (!(how & RCV_SHUTDOWN))
+ break;
+ /* Fall through */
+ case TCP_SYN_SENT:
+ err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ break;
+ }
+
+ /* Wake up anyone sleeping in poll. */
+ sk->sk_state_change(sk);
+ release_sock(sk);
+ return err;
+}
+
+/*
+ * ioctl() calls you can issue on an INET socket. Most of these are
+ * device configuration and stuff and very rarely used. Some ioctls
+ * pass on to the socket itself.
+ *
+ * NOTE: I like the idea of a module for the config stuff. ie ifconfig
+ * loads the devconfigure module does its configuring and unloads it.
+ * There's a good 20K of config code hanging around the kernel.
+ */
+
+int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ switch (cmd) {
+ case SIOCGSTAMP:
+ err = sock_get_timestamp(sk, (struct timeval __user *)arg);
+ break;
+ case SIOCADDRT:
+ case SIOCDELRT:
+ case SIOCRTMSG:
+ err = ip_rt_ioctl(cmd, (void __user *)arg);
+ break;
+ case SIOCDARP:
+ case SIOCGARP:
+ case SIOCSARP:
+ err = arp_ioctl(cmd, (void __user *)arg);
+ break;
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCSIFPFLAGS:
+ case SIOCGIFPFLAGS:
+ case SIOCSIFFLAGS:
+ err = devinet_ioctl(cmd, (void __user *)arg);
+ break;
+ default:
+ if (!sk->sk_prot->ioctl ||
+ (err = sk->sk_prot->ioctl(sk, cmd, arg)) ==
+ -ENOIOCTLCMD)
+ err = dev_ioctl(cmd, (void __user *)arg);
+ break;
+ }
+ return err;
+}
+
+struct proto_ops inet_stream_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = inet_accept,
+ .getname = inet_getname,
+ .poll = tcp_poll,
+ .ioctl = inet_ioctl,
+ .listen = inet_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = tcp_sendpage
+};
+
+struct proto_ops inet_dgram_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = inet_getname,
+ .poll = udp_poll,
+ .ioctl = inet_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+};
+
+/*
+ * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
+ * udp_poll
+ */
+static struct proto_ops inet_sockraw_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = inet_getname,
+ .poll = datagram_poll,
+ .ioctl = inet_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+};
+
+static struct net_proto_family inet_family_ops = {
+ .family = PF_INET,
+ .create = inet_create,
+ .owner = THIS_MODULE,
+};
+
+
+extern void tcp_init(void);
+extern void tcp_v4_init(struct net_proto_family *);
+
+/* Upon startup we insert all the elements in inetsw_array[] into
+ * the linked list inetsw.
+ */
+static struct inet_protosw inetsw_array[] =
+{
+ {
+ .type = SOCK_STREAM,
+ .protocol = IPPROTO_TCP,
+ .prot = &tcp_prot,
+ .ops = &inet_stream_ops,
+ .capability = -1,
+ .no_check = 0,
+ .flags = INET_PROTOSW_PERMANENT,
+ },
+
+ {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_UDP,
+ .prot = &udp_prot,
+ .ops = &inet_dgram_ops,
+ .capability = -1,
+ .no_check = UDP_CSUM_DEFAULT,
+ .flags = INET_PROTOSW_PERMANENT,
+ },
+
+
+ {
+ .type = SOCK_RAW,
+ .protocol = IPPROTO_IP, /* wild card */
+ .prot = &raw_prot,
+ .ops = &inet_sockraw_ops,
+ .capability = CAP_NET_RAW,
+ .no_check = UDP_CSUM_DEFAULT,
+ .flags = INET_PROTOSW_REUSE,
+ }
+};
+
+#define INETSW_ARRAY_LEN (sizeof(inetsw_array) / sizeof(struct inet_protosw))
+
+void inet_register_protosw(struct inet_protosw *p)
+{
+ struct list_head *lh;
+ struct inet_protosw *answer;
+ int protocol = p->protocol;
+ struct list_head *last_perm;
+
+ spin_lock_bh(&inetsw_lock);
+
+ if (p->type >= SOCK_MAX)
+ goto out_illegal;
+
+ /* If we are trying to override a permanent protocol, bail. */
+ answer = NULL;
+ last_perm = &inetsw[p->type];
+ list_for_each(lh, &inetsw[p->type]) {
+ answer = list_entry(lh, struct inet_protosw, list);
+
+ /* Check only the non-wild match. */
+ if (INET_PROTOSW_PERMANENT & answer->flags) {
+ if (protocol == answer->protocol)
+ break;
+ last_perm = lh;
+ }
+
+ answer = NULL;
+ }
+ if (answer)
+ goto out_permanent;
+
+ /* Add the new entry after the last permanent entry if any, so that
+ * the new entry does not override a permanent entry when matched with
+ * a wild-card protocol. But it is allowed to override any existing
+ * non-permanent entry. This means that when we remove this entry, the
+ * system automatically returns to the old behavior.
+ */
+ list_add_rcu(&p->list, last_perm);
+out:
+ spin_unlock_bh(&inetsw_lock);
+
+ synchronize_net();
+
+ return;
+
+out_permanent:
+ printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
+ protocol);
+ goto out;
+
+out_illegal:
+ printk(KERN_ERR
+ "Ignoring attempt to register invalid socket type %d.\n",
+ p->type);
+ goto out;
+}
+
+void inet_unregister_protosw(struct inet_protosw *p)
+{
+ if (INET_PROTOSW_PERMANENT & p->flags) {
+ printk(KERN_ERR
+ "Attempt to unregister permanent protocol %d.\n",
+ p->protocol);
+ } else {
+ spin_lock_bh(&inetsw_lock);
+ list_del_rcu(&p->list);
+ spin_unlock_bh(&inetsw_lock);
+
+ synchronize_net();
+ }
+}
+
+#ifdef CONFIG_IP_MULTICAST
+static struct net_protocol igmp_protocol = {
+ .handler = igmp_rcv,
+};
+#endif
+
+static struct net_protocol tcp_protocol = {
+ .handler = tcp_v4_rcv,
+ .err_handler = tcp_v4_err,
+ .no_policy = 1,
+};
+
+static struct net_protocol udp_protocol = {
+ .handler = udp_rcv,
+ .err_handler = udp_err,
+ .no_policy = 1,
+};
+
+static struct net_protocol icmp_protocol = {
+ .handler = icmp_rcv,
+};
+
+static int __init init_ipv4_mibs(void)
+{
+ net_statistics[0] = alloc_percpu(struct linux_mib);
+ net_statistics[1] = alloc_percpu(struct linux_mib);
+ ip_statistics[0] = alloc_percpu(struct ipstats_mib);
+ ip_statistics[1] = alloc_percpu(struct ipstats_mib);
+ icmp_statistics[0] = alloc_percpu(struct icmp_mib);
+ icmp_statistics[1] = alloc_percpu(struct icmp_mib);
+ tcp_statistics[0] = alloc_percpu(struct tcp_mib);
+ tcp_statistics[1] = alloc_percpu(struct tcp_mib);
+ udp_statistics[0] = alloc_percpu(struct udp_mib);
+ udp_statistics[1] = alloc_percpu(struct udp_mib);
+ if (!
+ (net_statistics[0] && net_statistics[1] && ip_statistics[0]
+ && ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1]
+ && udp_statistics[0] && udp_statistics[1]))
+ return -ENOMEM;
+
+ (void) tcp_mib_init();
+
+ return 0;
+}
+
+static int ipv4_proc_init(void);
+extern void ipfrag_init(void);
+
+static int __init inet_init(void)
+{
+ struct sk_buff *dummy_skb;
+ struct inet_protosw *q;
+ struct list_head *r;
+ int rc = -EINVAL;
+
+ if (sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)) {
+ printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
+ goto out;
+ }
+
+ rc = proto_register(&tcp_prot, 1);
+ if (rc)
+ goto out;
+
+ rc = proto_register(&udp_prot, 1);
+ if (rc)
+ goto out_unregister_tcp_proto;
+
+ rc = proto_register(&raw_prot, 1);
+ if (rc)
+ goto out_unregister_udp_proto;
+
+ /*
+ * Tell SOCKET that we are alive...
+ */
+
+ (void)sock_register(&inet_family_ops);
+
+ /*
+ * Add all the base protocols.
+ */
+
+ if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
+ printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
+ if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
+ printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
+ if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
+ printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
+#ifdef CONFIG_IP_MULTICAST
+ if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
+ printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
+#endif
+
+ /* Register the socket-side information for inet_create. */
+ for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
+ INIT_LIST_HEAD(r);
+
+ for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
+ inet_register_protosw(q);
+
+ /*
+ * Set the ARP module up
+ */
+
+ arp_init();
+
+ /*
+ * Set the IP module up
+ */
+
+ ip_init();
+
+ tcp_v4_init(&inet_family_ops);
+
+ /* Setup TCP slab cache for open requests. */
+ tcp_init();
+
+
+ /*
+ * Set the ICMP layer up
+ */
+
+ icmp_init(&inet_family_ops);
+
+ /*
+ * Initialise the multicast router
+ */
+#if defined(CONFIG_IP_MROUTE)
+ ip_mr_init();
+#endif
+ /*
+ * Initialise per-cpu ipv4 mibs
+ */
+
+ if(init_ipv4_mibs())
+ printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ;
+
+ ipv4_proc_init();
+
+ ipfrag_init();
+
+ rc = 0;
+out:
+ return rc;
+out_unregister_tcp_proto:
+ proto_unregister(&tcp_prot);
+out_unregister_udp_proto:
+ proto_unregister(&udp_prot);
+ goto out;
+}
+
+module_init(inet_init);
+
+/* ------------------------------------------------------------------------ */
+
+#ifdef CONFIG_PROC_FS
+extern int fib_proc_init(void);
+extern void fib_proc_exit(void);
+extern int ip_misc_proc_init(void);
+extern int raw_proc_init(void);
+extern void raw_proc_exit(void);
+extern int tcp4_proc_init(void);
+extern void tcp4_proc_exit(void);
+extern int udp4_proc_init(void);
+extern void udp4_proc_exit(void);
+
+static int __init ipv4_proc_init(void)
+{
+ int rc = 0;
+
+ if (raw_proc_init())
+ goto out_raw;
+ if (tcp4_proc_init())
+ goto out_tcp;
+ if (udp4_proc_init())
+ goto out_udp;
+ if (fib_proc_init())
+ goto out_fib;
+ if (ip_misc_proc_init())
+ goto out_misc;
+out:
+ return rc;
+out_misc:
+ fib_proc_exit();
+out_fib:
+ udp4_proc_exit();
+out_udp:
+ tcp4_proc_exit();
+out_tcp:
+ raw_proc_exit();
+out_raw:
+ rc = -ENOMEM;
+ goto out;
+}
+
+#else /* CONFIG_PROC_FS */
+static int __init ipv4_proc_init(void)
+{
+ return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
+MODULE_ALIAS_NETPROTO(PF_INET);
+
+EXPORT_SYMBOL(inet_accept);
+EXPORT_SYMBOL(inet_bind);
+EXPORT_SYMBOL(inet_dgram_connect);
+EXPORT_SYMBOL(inet_dgram_ops);
+EXPORT_SYMBOL(inet_getname);
+EXPORT_SYMBOL(inet_ioctl);
+EXPORT_SYMBOL(inet_listen);
+EXPORT_SYMBOL(inet_register_protosw);
+EXPORT_SYMBOL(inet_release);
+EXPORT_SYMBOL(inet_sendmsg);
+EXPORT_SYMBOL(inet_shutdown);
+EXPORT_SYMBOL(inet_sock_destruct);
+EXPORT_SYMBOL(inet_stream_connect);
+EXPORT_SYMBOL(inet_stream_ops);
+EXPORT_SYMBOL(inet_unregister_protosw);
+EXPORT_SYMBOL(net_statistics);
+
+#ifdef INET_REFCNT_DEBUG
+EXPORT_SYMBOL(inet_sock_nr);
+#endif
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
new file mode 100644
index 000000000000..0e98f2235b6e
--- /dev/null
+++ b/net/ipv4/ah4.c
@@ -0,0 +1,335 @@
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/ah.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <net/icmp.h>
+#include <asm/scatterlist.h>
+
+
+/* Clear mutable options and find final destination to substitute
+ * into IP header for icv calculation. Options are already checked
+ * for validity, so paranoia is not required. */
+
+static int ip_clear_mutable_options(struct iphdr *iph, u32 *daddr)
+{
+ unsigned char * optptr = (unsigned char*)(iph+1);
+ int l = iph->ihl*4 - sizeof(struct iphdr);
+ int optlen;
+
+ while (l > 0) {
+ switch (*optptr) {
+ case IPOPT_END:
+ return 0;
+ case IPOPT_NOOP:
+ l--;
+ optptr++;
+ continue;
+ }
+ optlen = optptr[1];
+ if (optlen<2 || optlen>l)
+ return -EINVAL;
+ switch (*optptr) {
+ case IPOPT_SEC:
+ case 0x85: /* Some "Extended Security" crap. */
+ case 0x86: /* Another "Commercial Security" crap. */
+ case IPOPT_RA:
+ case 0x80|21: /* RFC1770 */
+ break;
+ case IPOPT_LSRR:
+ case IPOPT_SSRR:
+ if (optlen < 6)
+ return -EINVAL;
+ memcpy(daddr, optptr+optlen-4, 4);
+ /* Fall through */
+ default:
+ memset(optptr+2, 0, optlen-2);
+ }
+ l -= optlen;
+ optptr += optlen;
+ }
+ return 0;
+}
+
+static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+ struct iphdr *iph, *top_iph;
+ struct ip_auth_hdr *ah;
+ struct ah_data *ahp;
+ union {
+ struct iphdr iph;
+ char buf[60];
+ } tmp_iph;
+
+ top_iph = skb->nh.iph;
+ iph = &tmp_iph.iph;
+
+ iph->tos = top_iph->tos;
+ iph->ttl = top_iph->ttl;
+ iph->frag_off = top_iph->frag_off;
+
+ if (top_iph->ihl != 5) {
+ iph->daddr = top_iph->daddr;
+ memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+ err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
+ if (err)
+ goto error;
+ }
+
+ ah = (struct ip_auth_hdr *)((char *)top_iph+top_iph->ihl*4);
+ ah->nexthdr = top_iph->protocol;
+
+ top_iph->tos = 0;
+ top_iph->tot_len = htons(skb->len);
+ top_iph->frag_off = 0;
+ top_iph->ttl = 0;
+ top_iph->protocol = IPPROTO_AH;
+ top_iph->check = 0;
+
+ ahp = x->data;
+ ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len) >> 2) - 2;
+
+ ah->reserved = 0;
+ ah->spi = x->id.spi;
+ ah->seq_no = htonl(++x->replay.oseq);
+ ahp->icv(ahp, skb, ah->auth_data);
+
+ top_iph->tos = iph->tos;
+ top_iph->ttl = iph->ttl;
+ top_iph->frag_off = iph->frag_off;
+ if (top_iph->ihl != 5) {
+ top_iph->daddr = iph->daddr;
+ memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+ }
+
+ ip_send_check(top_iph);
+
+ err = 0;
+
+error:
+ return err;
+}
+
+static int ah_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
+{
+ int ah_hlen;
+ struct iphdr *iph;
+ struct ip_auth_hdr *ah;
+ struct ah_data *ahp;
+ char work_buf[60];
+
+ if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr)))
+ goto out;
+
+ ah = (struct ip_auth_hdr*)skb->data;
+ ahp = x->data;
+ ah_hlen = (ah->hdrlen + 2) << 2;
+
+ if (ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len))
+ goto out;
+
+ if (!pskb_may_pull(skb, ah_hlen))
+ goto out;
+
+ /* We are going to _remove_ AH header to keep sockets happy,
+ * so... Later this can change. */
+ if (skb_cloned(skb) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ goto out;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ ah = (struct ip_auth_hdr*)skb->data;
+ iph = skb->nh.iph;
+
+ memcpy(work_buf, iph, iph->ihl*4);
+
+ iph->ttl = 0;
+ iph->tos = 0;
+ iph->frag_off = 0;
+ iph->check = 0;
+ if (iph->ihl != 5) {
+ u32 dummy;
+ if (ip_clear_mutable_options(iph, &dummy))
+ goto out;
+ }
+ {
+ u8 auth_data[MAX_AH_AUTH_LEN];
+
+ memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+ skb_push(skb, skb->data - skb->nh.raw);
+ ahp->icv(ahp, skb, ah->auth_data);
+ if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
+ x->stats.integrity_failed++;
+ goto out;
+ }
+ }
+ ((struct iphdr*)work_buf)->protocol = ah->nexthdr;
+ skb->nh.raw = skb_pull(skb, ah_hlen);
+ memcpy(skb->nh.raw, work_buf, iph->ihl*4);
+ skb->nh.iph->tot_len = htons(skb->len);
+ skb_pull(skb, skb->nh.iph->ihl*4);
+ skb->h.raw = skb->data;
+
+ return 0;
+
+out:
+ return -EINVAL;
+}
+
+static void ah4_err(struct sk_buff *skb, u32 info)
+{
+ struct iphdr *iph = (struct iphdr*)skb->data;
+ struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2));
+ struct xfrm_state *x;
+
+ if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
+ skb->h.icmph->code != ICMP_FRAG_NEEDED)
+ return;
+
+ x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
+ if (!x)
+ return;
+ printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
+ ntohl(ah->spi), ntohl(iph->daddr));
+ xfrm_state_put(x);
+}
+
+static int ah_init_state(struct xfrm_state *x, void *args)
+{
+ struct ah_data *ahp = NULL;
+ struct xfrm_algo_desc *aalg_desc;
+
+ if (!x->aalg)
+ goto error;
+
+ /* null auth can use a zero length key */
+ if (x->aalg->alg_key_len > 512)
+ goto error;
+
+ if (x->encap)
+ goto error;
+
+ ahp = kmalloc(sizeof(*ahp), GFP_KERNEL);
+ if (ahp == NULL)
+ return -ENOMEM;
+
+ memset(ahp, 0, sizeof(*ahp));
+
+ ahp->key = x->aalg->alg_key;
+ ahp->key_len = (x->aalg->alg_key_len+7)/8;
+ ahp->tfm = crypto_alloc_tfm(x->aalg->alg_name, 0);
+ if (!ahp->tfm)
+ goto error;
+ ahp->icv = ah_hmac_digest;
+
+ /*
+ * Lookup the algorithm description maintained by xfrm_algo,
+ * verify crypto transform properties, and store information
+ * we need for AH processing. This lookup cannot fail here
+ * after a successful crypto_alloc_tfm().
+ */
+ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+ BUG_ON(!aalg_desc);
+
+ if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+ crypto_tfm_alg_digestsize(ahp->tfm)) {
+ printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
+ x->aalg->alg_name, crypto_tfm_alg_digestsize(ahp->tfm),
+ aalg_desc->uinfo.auth.icv_fullbits/8);
+ goto error;
+ }
+
+ ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
+ ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
+
+ BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
+
+ ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL);
+ if (!ahp->work_icv)
+ goto error;
+
+ x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len);
+ if (x->props.mode)
+ x->props.header_len += sizeof(struct iphdr);
+ x->data = ahp;
+
+ return 0;
+
+error:
+ if (ahp) {
+ if (ahp->work_icv)
+ kfree(ahp->work_icv);
+ if (ahp->tfm)
+ crypto_free_tfm(ahp->tfm);
+ kfree(ahp);
+ }
+ return -EINVAL;
+}
+
+static void ah_destroy(struct xfrm_state *x)
+{
+ struct ah_data *ahp = x->data;
+
+ if (!ahp)
+ return;
+
+ if (ahp->work_icv) {
+ kfree(ahp->work_icv);
+ ahp->work_icv = NULL;
+ }
+ if (ahp->tfm) {
+ crypto_free_tfm(ahp->tfm);
+ ahp->tfm = NULL;
+ }
+ kfree(ahp);
+}
+
+
+static struct xfrm_type ah_type =
+{
+ .description = "AH4",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_AH,
+ .init_state = ah_init_state,
+ .destructor = ah_destroy,
+ .input = ah_input,
+ .output = ah_output
+};
+
+static struct net_protocol ah4_protocol = {
+ .handler = xfrm4_rcv,
+ .err_handler = ah4_err,
+ .no_policy = 1,
+};
+
+static int __init ah4_init(void)
+{
+ if (xfrm_register_type(&ah_type, AF_INET) < 0) {
+ printk(KERN_INFO "ip ah init: can't add xfrm type\n");
+ return -EAGAIN;
+ }
+ if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
+ printk(KERN_INFO "ip ah init: can't add protocol\n");
+ xfrm_unregister_type(&ah_type, AF_INET);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit ah4_fini(void)
+{
+ if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
+ printk(KERN_INFO "ip ah close: can't remove protocol\n");
+ if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
+ printk(KERN_INFO "ip ah close: can't remove xfrm type\n");
+}
+
+module_init(ah4_init);
+module_exit(ah4_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
new file mode 100644
index 000000000000..a642fd612853
--- /dev/null
+++ b/net/ipv4/arp.c
@@ -0,0 +1,1425 @@
+/* linux/net/inet/arp.c
+ *
+ * Version: $Id: arp.c,v 1.99 2001/08/30 22:55:42 davem Exp $
+ *
+ * Copyright (C) 1994 by Florian La Roche
+ *
+ * This module implements the Address Resolution Protocol ARP (RFC 826),
+ * which is used to convert IP addresses (or in the future maybe other
+ * high-level addresses) into a low-level hardware address (like an Ethernet
+ * address).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Alan Cox : Removed the Ethernet assumptions in
+ * Florian's code
+ * Alan Cox : Fixed some small errors in the ARP
+ * logic
+ * Alan Cox : Allow >4K in /proc
+ * Alan Cox : Make ARP add its own protocol entry
+ * Ross Martin : Rewrote arp_rcv() and arp_get_info()
+ * Stephen Henson : Add AX25 support to arp_get_info()
+ * Alan Cox : Drop data when a device is downed.
+ * Alan Cox : Use init_timer().
+ * Alan Cox : Double lock fixes.
+ * Martin Seine : Move the arphdr structure
+ * to if_arp.h for compatibility.
+ * with BSD based programs.
+ * Andrew Tridgell : Added ARP netmask code and
+ * re-arranged proxy handling.
+ * Alan Cox : Changed to use notifiers.
+ * Niibe Yutaka : Reply for this device or proxies only.
+ * Alan Cox : Don't proxy across hardware types!
+ * Jonathan Naylor : Added support for NET/ROM.
+ * Mike Shaver : RFC1122 checks.
+ * Jonathan Naylor : Only lookup the hardware address for
+ * the correct hardware type.
+ * Germano Caronni : Assorted subtle races.
+ * Craig Schlenter : Don't modify permanent entry
+ * during arp_rcv.
+ * Russ Nelson : Tidied up a few bits.
+ * Alexey Kuznetsov: Major changes to caching and behaviour,
+ * eg intelligent arp probing and
+ * generation
+ * of host down events.
+ * Alan Cox : Missing unlock in device events.
+ * Eckes : ARP ioctl control errors.
+ * Alexey Kuznetsov: Arp free fix.
+ * Manuel Rodriguez: Gratuitous ARP.
+ * Jonathan Layes : Added arpd support through kerneld
+ * message queue (960314)
+ * Mike Shaver : /proc/sys/net/ipv4/arp_* support
+ * Mike McLagan : Routing by source
+ * Stuart Cheshire : Metricom and grat arp fixes
+ * *** FOR 2.1 clean this up ***
+ * Lawrence V. Stefani: (08/12/96) Added FDDI support.
+ * Alan Cox : Took the AP1000 nasty FDDI hack and
+ * folded into the mainstream FDDI code.
+ * Ack spit, Linus how did you allow that
+ * one in...
+ * Jes Sorensen : Make FDDI work again in 2.1.x and
+ * clean up the APFDDI & gen. FDDI bits.
+ * Alexey Kuznetsov: new arp state machine;
+ * now it is in net/core/neighbour.c.
+ * Krzysztof Halasa: Added Frame Relay ARP support.
+ * Arnaldo C. Melo : convert /proc/net/arp to seq_file
+ * Shmulik Hen: Split arp_send to arp_create and
+ * arp_xmit so intermediate drivers like
+ * bonding can change the skb before
+ * sending (e.g. insert 8021q tag).
+ * Harald Welte : convert to make use of jenkins hash
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/config.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/fddidevice.h>
+#include <linux/if_arp.h>
+#include <linux/trdevice.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/jhash.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#include <net/ax25.h>
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+#include <net/netrom.h>
+#endif
+#endif
+#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
+#include <net/atmclip.h>
+struct neigh_table *clip_tbl_hook;
+#endif
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/netfilter_arp.h>
+
+/*
+ * Interface to generic neighbour cache.
+ */
+static u32 arp_hash(const void *pkey, const struct net_device *dev);
+static int arp_constructor(struct neighbour *neigh);
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
+static void parp_redo(struct sk_buff *skb);
+
+static struct neigh_ops arp_generic_ops = {
+ .family = AF_INET,
+ .solicit = arp_solicit,
+ .error_report = arp_error_report,
+ .output = neigh_resolve_output,
+ .connected_output = neigh_connected_output,
+ .hh_output = dev_queue_xmit,
+ .queue_xmit = dev_queue_xmit,
+};
+
+static struct neigh_ops arp_hh_ops = {
+ .family = AF_INET,
+ .solicit = arp_solicit,
+ .error_report = arp_error_report,
+ .output = neigh_resolve_output,
+ .connected_output = neigh_resolve_output,
+ .hh_output = dev_queue_xmit,
+ .queue_xmit = dev_queue_xmit,
+};
+
+static struct neigh_ops arp_direct_ops = {
+ .family = AF_INET,
+ .output = dev_queue_xmit,
+ .connected_output = dev_queue_xmit,
+ .hh_output = dev_queue_xmit,
+ .queue_xmit = dev_queue_xmit,
+};
+
+struct neigh_ops arp_broken_ops = {
+ .family = AF_INET,
+ .solicit = arp_solicit,
+ .error_report = arp_error_report,
+ .output = neigh_compat_output,
+ .connected_output = neigh_compat_output,
+ .hh_output = dev_queue_xmit,
+ .queue_xmit = dev_queue_xmit,
+};
+
+struct neigh_table arp_tbl = {
+ .family = AF_INET,
+ .entry_size = sizeof(struct neighbour) + 4,
+ .key_len = 4,
+ .hash = arp_hash,
+ .constructor = arp_constructor,
+ .proxy_redo = parp_redo,
+ .id = "arp_cache",
+ .parms = {
+ .tbl = &arp_tbl,
+ .base_reachable_time = 30 * HZ,
+ .retrans_time = 1 * HZ,
+ .gc_staletime = 60 * HZ,
+ .reachable_time = 30 * HZ,
+ .delay_probe_time = 5 * HZ,
+ .queue_len = 3,
+ .ucast_probes = 3,
+ .mcast_probes = 3,
+ .anycast_delay = 1 * HZ,
+ .proxy_delay = (8 * HZ) / 10,
+ .proxy_qlen = 64,
+ .locktime = 1 * HZ,
+ },
+ .gc_interval = 30 * HZ,
+ .gc_thresh1 = 128,
+ .gc_thresh2 = 512,
+ .gc_thresh3 = 1024,
+};
+
+int arp_mc_map(u32 addr, u8 *haddr, struct net_device *dev, int dir)
+{
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ case ARPHRD_FDDI:
+ case ARPHRD_IEEE802:
+ ip_eth_mc_map(addr, haddr);
+ return 0;
+ case ARPHRD_IEEE802_TR:
+ ip_tr_mc_map(addr, haddr);
+ return 0;
+ case ARPHRD_INFINIBAND:
+ ip_ib_mc_map(addr, haddr);
+ return 0;
+ default:
+ if (dir) {
+ memcpy(haddr, dev->broadcast, dev->addr_len);
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+
+static u32 arp_hash(const void *pkey, const struct net_device *dev)
+{
+ return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
+}
+
+static int arp_constructor(struct neighbour *neigh)
+{
+ u32 addr = *(u32*)neigh->primary_key;
+ struct net_device *dev = neigh->dev;
+ struct in_device *in_dev;
+ struct neigh_parms *parms;
+
+ neigh->type = inet_addr_type(addr);
+
+ rcu_read_lock();
+ in_dev = rcu_dereference(__in_dev_get(dev));
+ if (in_dev == NULL) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ parms = in_dev->arp_parms;
+ __neigh_parms_put(neigh->parms);
+ neigh->parms = neigh_parms_clone(parms);
+ rcu_read_unlock();
+
+ if (dev->hard_header == NULL) {
+ neigh->nud_state = NUD_NOARP;
+ neigh->ops = &arp_direct_ops;
+ neigh->output = neigh->ops->queue_xmit;
+ } else {
+ /* Good devices (checked by reading texts, but only Ethernet is
+ tested)
+
+ ARPHRD_ETHER: (ethernet, apfddi)
+ ARPHRD_FDDI: (fddi)
+ ARPHRD_IEEE802: (tr)
+ ARPHRD_METRICOM: (strip)
+ ARPHRD_ARCNET:
+ etc. etc. etc.
+
+ ARPHRD_IPDDP will also work, if author repairs it.
+ I did not it, because this driver does not work even
+ in old paradigm.
+ */
+
+#if 1
+ /* So... these "amateur" devices are hopeless.
+ The only thing, that I can say now:
+ It is very sad that we need to keep ugly obsolete
+ code to make them happy.
+
+ They should be moved to more reasonable state, now
+ they use rebuild_header INSTEAD OF hard_start_xmit!!!
+ Besides that, they are sort of out of date
+ (a lot of redundant clones/copies, useless in 2.1),
+ I wonder why people believe that they work.
+ */
+ switch (dev->type) {
+ default:
+ break;
+ case ARPHRD_ROSE:
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ case ARPHRD_AX25:
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+ case ARPHRD_NETROM:
+#endif
+ neigh->ops = &arp_broken_ops;
+ neigh->output = neigh->ops->output;
+ return 0;
+#endif
+ ;}
+#endif
+ if (neigh->type == RTN_MULTICAST) {
+ neigh->nud_state = NUD_NOARP;
+ arp_mc_map(addr, neigh->ha, dev, 1);
+ } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+ neigh->nud_state = NUD_NOARP;
+ memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
+ } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
+ neigh->nud_state = NUD_NOARP;
+ memcpy(neigh->ha, dev->broadcast, dev->addr_len);
+ }
+ if (dev->hard_header_cache)
+ neigh->ops = &arp_hh_ops;
+ else
+ neigh->ops = &arp_generic_ops;
+ if (neigh->nud_state&NUD_VALID)
+ neigh->output = neigh->ops->connected_output;
+ else
+ neigh->output = neigh->ops->output;
+ }
+ return 0;
+}
+
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
+{
+ dst_link_failure(skb);
+ kfree_skb(skb);
+}
+
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
+{
+ u32 saddr = 0;
+ u8 *dst_ha = NULL;
+ struct net_device *dev = neigh->dev;
+ u32 target = *(u32*)neigh->primary_key;
+ int probes = atomic_read(&neigh->probes);
+ struct in_device *in_dev = in_dev_get(dev);
+
+ if (!in_dev)
+ return;
+
+ switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
+ default:
+ case 0: /* By default announce any local IP */
+ if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL)
+ saddr = skb->nh.iph->saddr;
+ break;
+ case 1: /* Restrict announcements of saddr in same subnet */
+ if (!skb)
+ break;
+ saddr = skb->nh.iph->saddr;
+ if (inet_addr_type(saddr) == RTN_LOCAL) {
+ /* saddr should be known to target */
+ if (inet_addr_onlink(in_dev, target, saddr))
+ break;
+ }
+ saddr = 0;
+ break;
+ case 2: /* Avoid secondary IPs, get a primary/preferred one */
+ break;
+ }
+
+ if (in_dev)
+ in_dev_put(in_dev);
+ if (!saddr)
+ saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
+
+ if ((probes -= neigh->parms->ucast_probes) < 0) {
+ if (!(neigh->nud_state&NUD_VALID))
+ printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
+ dst_ha = neigh->ha;
+ read_lock_bh(&neigh->lock);
+ } else if ((probes -= neigh->parms->app_probes) < 0) {
+#ifdef CONFIG_ARPD
+ neigh_app_ns(neigh);
+#endif
+ return;
+ }
+
+ arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
+ dst_ha, dev->dev_addr, NULL);
+ if (dst_ha)
+ read_unlock_bh(&neigh->lock);
+}
+
+static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
+ u32 sip, u32 tip)
+{
+ int scope;
+
+ switch (IN_DEV_ARP_IGNORE(in_dev)) {
+ case 0: /* Reply, the tip is already validated */
+ return 0;
+ case 1: /* Reply only if tip is configured on the incoming interface */
+ sip = 0;
+ scope = RT_SCOPE_HOST;
+ break;
+ case 2: /*
+ * Reply only if tip is configured on the incoming interface
+ * and is in same subnet as sip
+ */
+ scope = RT_SCOPE_HOST;
+ break;
+ case 3: /* Do not reply for scope host addresses */
+ sip = 0;
+ scope = RT_SCOPE_LINK;
+ dev = NULL;
+ break;
+ case 4: /* Reserved */
+ case 5:
+ case 6:
+ case 7:
+ return 0;
+ case 8: /* Do not reply */
+ return 1;
+ default:
+ return 0;
+ }
+ return !inet_confirm_addr(dev, sip, tip, scope);
+}
+
+static int arp_filter(__u32 sip, __u32 tip, struct net_device *dev)
+{
+ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip,
+ .saddr = tip } } };
+ struct rtable *rt;
+ int flag = 0;
+ /*unsigned long now; */
+
+ if (ip_route_output_key(&rt, &fl) < 0)
+ return 1;
+ if (rt->u.dst.dev != dev) {
+ NET_INC_STATS_BH(LINUX_MIB_ARPFILTER);
+ flag = 1;
+ }
+ ip_rt_put(rt);
+ return flag;
+}
+
+/* OBSOLETE FUNCTIONS */
+
+/*
+ * Find an arp mapping in the cache. If not found, post a request.
+ *
+ * It is very UGLY routine: it DOES NOT use skb->dst->neighbour,
+ * even if it exists. It is supposed that skb->dev was mangled
+ * by a virtual device (eql, shaper). Nobody but broken devices
+ * is allowed to use this function, it is scheduled to be removed. --ANK
+ */
+
+static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, struct net_device * dev)
+{
+ switch (addr_hint) {
+ case RTN_LOCAL:
+ printk(KERN_DEBUG "ARP: arp called for own IP address\n");
+ memcpy(haddr, dev->dev_addr, dev->addr_len);
+ return 1;
+ case RTN_MULTICAST:
+ arp_mc_map(paddr, haddr, dev, 1);
+ return 1;
+ case RTN_BROADCAST:
+ memcpy(haddr, dev->broadcast, dev->addr_len);
+ return 1;
+ }
+ return 0;
+}
+
+
+int arp_find(unsigned char *haddr, struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ u32 paddr;
+ struct neighbour *n;
+
+ if (!skb->dst) {
+ printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
+ kfree_skb(skb);
+ return 1;
+ }
+
+ paddr = ((struct rtable*)skb->dst)->rt_gateway;
+
+ if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev))
+ return 0;
+
+ n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
+
+ if (n) {
+ n->used = jiffies;
+ if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
+ read_lock_bh(&n->lock);
+ memcpy(haddr, n->ha, dev->addr_len);
+ read_unlock_bh(&n->lock);
+ neigh_release(n);
+ return 0;
+ }
+ neigh_release(n);
+ } else
+ kfree_skb(skb);
+ return 1;
+}
+
+/* END OF OBSOLETE FUNCTIONS */
+
+int arp_bind_neighbour(struct dst_entry *dst)
+{
+ struct net_device *dev = dst->dev;
+ struct neighbour *n = dst->neighbour;
+
+ if (dev == NULL)
+ return -EINVAL;
+ if (n == NULL) {
+ u32 nexthop = ((struct rtable*)dst)->rt_gateway;
+ if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
+ nexthop = 0;
+ n = __neigh_lookup_errno(
+#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
+ dev->type == ARPHRD_ATM ? clip_tbl_hook :
+#endif
+ &arp_tbl, &nexthop, dev);
+ if (IS_ERR(n))
+ return PTR_ERR(n);
+ dst->neighbour = n;
+ }
+ return 0;
+}
+
+/*
+ * Check if we can use proxy ARP for this path
+ */
+
+static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt)
+{
+ struct in_device *out_dev;
+ int imi, omi = -1;
+
+ if (!IN_DEV_PROXY_ARP(in_dev))
+ return 0;
+
+ if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0)
+ return 1;
+ if (imi == -1)
+ return 0;
+
+ /* place to check for proxy_arp for routes */
+
+ if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) {
+ omi = IN_DEV_MEDIUM_ID(out_dev);
+ in_dev_put(out_dev);
+ }
+ return (omi != imi && omi != -1);
+}
+
+/*
+ * Interface to link layer: send routine and receive handler.
+ */
+
+/*
+ * Create an arp packet. If (dest_hw == NULL), we create a broadcast
+ * message.
+ */
+struct sk_buff *arp_create(int type, int ptype, u32 dest_ip,
+ struct net_device *dev, u32 src_ip,
+ unsigned char *dest_hw, unsigned char *src_hw,
+ unsigned char *target_hw)
+{
+ struct sk_buff *skb;
+ struct arphdr *arp;
+ unsigned char *arp_ptr;
+
+ /*
+ * Allocate a buffer
+ */
+
+ skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4)
+ + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
+ if (skb == NULL)
+ return NULL;
+
+ skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb->nh.raw = skb->data;
+ arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4));
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_ARP);
+ if (src_hw == NULL)
+ src_hw = dev->dev_addr;
+ if (dest_hw == NULL)
+ dest_hw = dev->broadcast;
+
+ /*
+ * Fill the device header for the ARP frame
+ */
+ if (dev->hard_header &&
+ dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0)
+ goto out;
+
+ /*
+ * Fill out the arp protocol part.
+ *
+ * The arp hardware type should match the device type, except for FDDI,
+ * which (according to RFC 1390) should always equal 1 (Ethernet).
+ */
+ /*
+ * Exceptions everywhere. AX.25 uses the AX.25 PID value not the
+ * DIX code for the protocol. Make these device structure fields.
+ */
+ switch (dev->type) {
+ default:
+ arp->ar_hrd = htons(dev->type);
+ arp->ar_pro = htons(ETH_P_IP);
+ break;
+
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ case ARPHRD_AX25:
+ arp->ar_hrd = htons(ARPHRD_AX25);
+ arp->ar_pro = htons(AX25_P_IP);
+ break;
+
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+ case ARPHRD_NETROM:
+ arp->ar_hrd = htons(ARPHRD_NETROM);
+ arp->ar_pro = htons(AX25_P_IP);
+ break;
+#endif
+#endif
+
+#ifdef CONFIG_FDDI
+ case ARPHRD_FDDI:
+ arp->ar_hrd = htons(ARPHRD_ETHER);
+ arp->ar_pro = htons(ETH_P_IP);
+ break;
+#endif
+#ifdef CONFIG_TR
+ case ARPHRD_IEEE802_TR:
+ arp->ar_hrd = htons(ARPHRD_IEEE802);
+ arp->ar_pro = htons(ETH_P_IP);
+ break;
+#endif
+ }
+
+ arp->ar_hln = dev->addr_len;
+ arp->ar_pln = 4;
+ arp->ar_op = htons(type);
+
+ arp_ptr=(unsigned char *)(arp+1);
+
+ memcpy(arp_ptr, src_hw, dev->addr_len);
+ arp_ptr+=dev->addr_len;
+ memcpy(arp_ptr, &src_ip,4);
+ arp_ptr+=4;
+ if (target_hw != NULL)
+ memcpy(arp_ptr, target_hw, dev->addr_len);
+ else
+ memset(arp_ptr, 0, dev->addr_len);
+ arp_ptr+=dev->addr_len;
+ memcpy(arp_ptr, &dest_ip, 4);
+
+ return skb;
+
+out:
+ kfree_skb(skb);
+ return NULL;
+}
+
+/*
+ * Send an arp packet.
+ */
+void arp_xmit(struct sk_buff *skb)
+{
+ /* Send it off, maybe filter it using firewalling first. */
+ NF_HOOK(NF_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
+}
+
+/*
+ * Create and send an arp packet.
+ */
+void arp_send(int type, int ptype, u32 dest_ip,
+ struct net_device *dev, u32 src_ip,
+ unsigned char *dest_hw, unsigned char *src_hw,
+ unsigned char *target_hw)
+{
+ struct sk_buff *skb;
+
+ /*
+ * No arp on this interface.
+ */
+
+ if (dev->flags&IFF_NOARP)
+ return;
+
+ skb = arp_create(type, ptype, dest_ip, dev, src_ip,
+ dest_hw, src_hw, target_hw);
+ if (skb == NULL) {
+ return;
+ }
+
+ arp_xmit(skb);
+}
+
+static void parp_redo(struct sk_buff *skb)
+{
+ nf_reset(skb);
+ arp_rcv(skb, skb->dev, NULL);
+}
+
+/*
+ * Process an arp request.
+ */
+
+static int arp_process(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev = in_dev_get(dev);
+ struct arphdr *arp;
+ unsigned char *arp_ptr;
+ struct rtable *rt;
+ unsigned char *sha, *tha;
+ u32 sip, tip;
+ u16 dev_type = dev->type;
+ int addr_type;
+ struct neighbour *n;
+
+ /* arp_rcv below verifies the ARP header and verifies the device
+ * is ARP'able.
+ */
+
+ if (in_dev == NULL)
+ goto out;
+
+ arp = skb->nh.arph;
+
+ switch (dev_type) {
+ default:
+ if (arp->ar_pro != htons(ETH_P_IP) ||
+ htons(dev_type) != arp->ar_hrd)
+ goto out;
+ break;
+#ifdef CONFIG_NET_ETHERNET
+ case ARPHRD_ETHER:
+#endif
+#ifdef CONFIG_TR
+ case ARPHRD_IEEE802_TR:
+#endif
+#ifdef CONFIG_FDDI
+ case ARPHRD_FDDI:
+#endif
+#ifdef CONFIG_NET_FC
+ case ARPHRD_IEEE802:
+#endif
+#if defined(CONFIG_NET_ETHERNET) || defined(CONFIG_TR) || \
+ defined(CONFIG_FDDI) || defined(CONFIG_NET_FC)
+ /*
+ * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802
+ * devices, according to RFC 2625) devices will accept ARP
+ * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
+ * This is the case also of FDDI, where the RFC 1390 says that
+ * FDDI devices should accept ARP hardware of (1) Ethernet,
+ * however, to be more robust, we'll accept both 1 (Ethernet)
+ * or 6 (IEEE 802.2)
+ */
+ if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
+ arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
+ arp->ar_pro != htons(ETH_P_IP))
+ goto out;
+ break;
+#endif
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ case ARPHRD_AX25:
+ if (arp->ar_pro != htons(AX25_P_IP) ||
+ arp->ar_hrd != htons(ARPHRD_AX25))
+ goto out;
+ break;
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+ case ARPHRD_NETROM:
+ if (arp->ar_pro != htons(AX25_P_IP) ||
+ arp->ar_hrd != htons(ARPHRD_NETROM))
+ goto out;
+ break;
+#endif
+#endif
+ }
+
+ /* Understand only these message types */
+
+ if (arp->ar_op != htons(ARPOP_REPLY) &&
+ arp->ar_op != htons(ARPOP_REQUEST))
+ goto out;
+
+/*
+ * Extract fields
+ */
+ arp_ptr= (unsigned char *)(arp+1);
+ sha = arp_ptr;
+ arp_ptr += dev->addr_len;
+ memcpy(&sip, arp_ptr, 4);
+ arp_ptr += 4;
+ tha = arp_ptr;
+ arp_ptr += dev->addr_len;
+ memcpy(&tip, arp_ptr, 4);
+/*
+ * Check for bad requests for 127.x.x.x and requests for multicast
+ * addresses. If this is one such, delete it.
+ */
+ if (LOOPBACK(tip) || MULTICAST(tip))
+ goto out;
+
+/*
+ * Special case: We must set Frame Relay source Q.922 address
+ */
+ if (dev_type == ARPHRD_DLCI)
+ sha = dev->broadcast;
+
+/*
+ * Process entry. The idea here is we want to send a reply if it is a
+ * request for us or if it is a request for someone else that we hold
+ * a proxy for. We want to add an entry to our cache if it is a reply
+ * to us or if it is a request for our address.
+ * (The assumption for this last is that if someone is requesting our
+ * address, they are probably intending to talk to us, so it saves time
+ * if we cache their address. Their address is also probably not in
+ * our cache, since ours is not in their cache.)
+ *
+ * Putting this another way, we only care about replies if they are to
+ * us, in which case we add them to the cache. For requests, we care
+ * about those for us and those for our proxies. We reply to both,
+ * and in the case of requests for us we add the requester to the arp
+ * cache.
+ */
+
+ /* Special case: IPv4 duplicate address detection packet (RFC2131) */
+ if (sip == 0) {
+ if (arp->ar_op == htons(ARPOP_REQUEST) &&
+ inet_addr_type(tip) == RTN_LOCAL &&
+ !arp_ignore(in_dev,dev,sip,tip))
+ arp_send(ARPOP_REPLY,ETH_P_ARP,tip,dev,tip,sha,dev->dev_addr,dev->dev_addr);
+ goto out;
+ }
+
+ if (arp->ar_op == htons(ARPOP_REQUEST) &&
+ ip_route_input(skb, tip, sip, 0, dev) == 0) {
+
+ rt = (struct rtable*)skb->dst;
+ addr_type = rt->rt_type;
+
+ if (addr_type == RTN_LOCAL) {
+ n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+ if (n) {
+ int dont_send = 0;
+
+ if (!dont_send)
+ dont_send |= arp_ignore(in_dev,dev,sip,tip);
+ if (!dont_send && IN_DEV_ARPFILTER(in_dev))
+ dont_send |= arp_filter(sip,tip,dev);
+ if (!dont_send)
+ arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+
+ neigh_release(n);
+ }
+ goto out;
+ } else if (IN_DEV_FORWARD(in_dev)) {
+ if ((rt->rt_flags&RTCF_DNAT) ||
+ (addr_type == RTN_UNICAST && rt->u.dst.dev != dev &&
+ (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) {
+ n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+ if (n)
+ neigh_release(n);
+
+ if (skb->stamp.tv_sec == LOCALLY_ENQUEUED ||
+ skb->pkt_type == PACKET_HOST ||
+ in_dev->arp_parms->proxy_delay == 0) {
+ arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+ } else {
+ pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
+ in_dev_put(in_dev);
+ return 0;
+ }
+ goto out;
+ }
+ }
+ }
+
+ /* Update our ARP tables */
+
+ n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
+
+#ifdef CONFIG_IP_ACCEPT_UNSOLICITED_ARP
+ /* Unsolicited ARP is not accepted by default.
+ It is possible, that this option should be enabled for some
+ devices (strip is candidate)
+ */
+ if (n == NULL &&
+ arp->ar_op == htons(ARPOP_REPLY) &&
+ inet_addr_type(sip) == RTN_UNICAST)
+ n = __neigh_lookup(&arp_tbl, &sip, dev, -1);
+#endif
+
+ if (n) {
+ int state = NUD_REACHABLE;
+ int override;
+
+ /* If several different ARP replies follows back-to-back,
+ use the FIRST one. It is possible, if several proxy
+ agents are active. Taking the first reply prevents
+ arp trashing and chooses the fastest router.
+ */
+ override = time_after(jiffies, n->updated + n->parms->locktime);
+
+ /* Broadcast replies and request packets
+ do not assert neighbour reachability.
+ */
+ if (arp->ar_op != htons(ARPOP_REPLY) ||
+ skb->pkt_type != PACKET_HOST)
+ state = NUD_STALE;
+ neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+ neigh_release(n);
+ }
+
+out:
+ if (in_dev)
+ in_dev_put(in_dev);
+ kfree_skb(skb);
+ return 0;
+}
+
+
+/*
+ * Receive an arp request from the device layer.
+ */
+
+int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+{
+ struct arphdr *arp;
+
+ /* ARP header, plus 2 device addresses, plus 2 IP addresses. */
+ if (!pskb_may_pull(skb, (sizeof(struct arphdr) +
+ (2 * dev->addr_len) +
+ (2 * sizeof(u32)))))
+ goto freeskb;
+
+ arp = skb->nh.arph;
+ if (arp->ar_hln != dev->addr_len ||
+ dev->flags & IFF_NOARP ||
+ skb->pkt_type == PACKET_OTHERHOST ||
+ skb->pkt_type == PACKET_LOOPBACK ||
+ arp->ar_pln != 4)
+ goto freeskb;
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ goto out_of_mem;
+
+ return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
+
+freeskb:
+ kfree_skb(skb);
+out_of_mem:
+ return 0;
+}
+
+/*
+ * User level interface (ioctl)
+ */
+
+/*
+ * Set (create) an ARP cache entry.
+ */
+
+static int arp_req_set(struct arpreq *r, struct net_device * dev)
+{
+ u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+ struct neighbour *neigh;
+ int err;
+
+ if (r->arp_flags&ATF_PUBL) {
+ u32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr;
+ if (mask && mask != 0xFFFFFFFF)
+ return -EINVAL;
+ if (!dev && (r->arp_flags & ATF_COM)) {
+ dev = dev_getbyhwaddr(r->arp_ha.sa_family, r->arp_ha.sa_data);
+ if (!dev)
+ return -ENODEV;
+ }
+ if (mask) {
+ if (pneigh_lookup(&arp_tbl, &ip, dev, 1) == NULL)
+ return -ENOBUFS;
+ return 0;
+ }
+ if (dev == NULL) {
+ ipv4_devconf.proxy_arp = 1;
+ return 0;
+ }
+ if (__in_dev_get(dev)) {
+ __in_dev_get(dev)->cnf.proxy_arp = 1;
+ return 0;
+ }
+ return -ENXIO;
+ }
+
+ if (r->arp_flags & ATF_PERM)
+ r->arp_flags |= ATF_COM;
+ if (dev == NULL) {
+ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
+ .tos = RTO_ONLINK } } };
+ struct rtable * rt;
+ if ((err = ip_route_output_key(&rt, &fl)) != 0)
+ return err;
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ if (!dev)
+ return -EINVAL;
+ }
+ switch (dev->type) {
+#ifdef CONFIG_FDDI
+ case ARPHRD_FDDI:
+ /*
+ * According to RFC 1390, FDDI devices should accept ARP
+ * hardware types of 1 (Ethernet). However, to be more
+ * robust, we'll accept hardware types of either 1 (Ethernet)
+ * or 6 (IEEE 802.2).
+ */
+ if (r->arp_ha.sa_family != ARPHRD_FDDI &&
+ r->arp_ha.sa_family != ARPHRD_ETHER &&
+ r->arp_ha.sa_family != ARPHRD_IEEE802)
+ return -EINVAL;
+ break;
+#endif
+ default:
+ if (r->arp_ha.sa_family != dev->type)
+ return -EINVAL;
+ break;
+ }
+
+ neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
+ err = PTR_ERR(neigh);
+ if (!IS_ERR(neigh)) {
+ unsigned state = NUD_STALE;
+ if (r->arp_flags & ATF_PERM)
+ state = NUD_PERMANENT;
+ err = neigh_update(neigh, (r->arp_flags&ATF_COM) ?
+ r->arp_ha.sa_data : NULL, state,
+ NEIGH_UPDATE_F_OVERRIDE|
+ NEIGH_UPDATE_F_ADMIN);
+ neigh_release(neigh);
+ }
+ return err;
+}
+
+static unsigned arp_state_to_flags(struct neighbour *neigh)
+{
+ unsigned flags = 0;
+ if (neigh->nud_state&NUD_PERMANENT)
+ flags = ATF_PERM|ATF_COM;
+ else if (neigh->nud_state&NUD_VALID)
+ flags = ATF_COM;
+ return flags;
+}
+
+/*
+ * Get an ARP cache entry.
+ */
+
+static int arp_req_get(struct arpreq *r, struct net_device *dev)
+{
+ u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+ struct neighbour *neigh;
+ int err = -ENXIO;
+
+ neigh = neigh_lookup(&arp_tbl, &ip, dev);
+ if (neigh) {
+ read_lock_bh(&neigh->lock);
+ memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
+ r->arp_flags = arp_state_to_flags(neigh);
+ read_unlock_bh(&neigh->lock);
+ r->arp_ha.sa_family = dev->type;
+ strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+ neigh_release(neigh);
+ err = 0;
+ }
+ return err;
+}
+
+static int arp_req_delete(struct arpreq *r, struct net_device * dev)
+{
+ int err;
+ u32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+ struct neighbour *neigh;
+
+ if (r->arp_flags & ATF_PUBL) {
+ u32 mask =
+ ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+ if (mask == 0xFFFFFFFF)
+ return pneigh_delete(&arp_tbl, &ip, dev);
+ if (mask == 0) {
+ if (dev == NULL) {
+ ipv4_devconf.proxy_arp = 0;
+ return 0;
+ }
+ if (__in_dev_get(dev)) {
+ __in_dev_get(dev)->cnf.proxy_arp = 0;
+ return 0;
+ }
+ return -ENXIO;
+ }
+ return -EINVAL;
+ }
+
+ if (dev == NULL) {
+ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
+ .tos = RTO_ONLINK } } };
+ struct rtable * rt;
+ if ((err = ip_route_output_key(&rt, &fl)) != 0)
+ return err;
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ if (!dev)
+ return -EINVAL;
+ }
+ err = -ENXIO;
+ neigh = neigh_lookup(&arp_tbl, &ip, dev);
+ if (neigh) {
+ if (neigh->nud_state&~NUD_NOARP)
+ err = neigh_update(neigh, NULL, NUD_FAILED,
+ NEIGH_UPDATE_F_OVERRIDE|
+ NEIGH_UPDATE_F_ADMIN);
+ neigh_release(neigh);
+ }
+ return err;
+}
+
+/*
+ * Handle an ARP layer I/O control request.
+ */
+
+int arp_ioctl(unsigned int cmd, void __user *arg)
+{
+ int err;
+ struct arpreq r;
+ struct net_device *dev = NULL;
+
+ switch (cmd) {
+ case SIOCDARP:
+ case SIOCSARP:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ case SIOCGARP:
+ err = copy_from_user(&r, arg, sizeof(struct arpreq));
+ if (err)
+ return -EFAULT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (r.arp_pa.sa_family != AF_INET)
+ return -EPFNOSUPPORT;
+
+ if (!(r.arp_flags & ATF_PUBL) &&
+ (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB)))
+ return -EINVAL;
+ if (!(r.arp_flags & ATF_NETMASK))
+ ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
+ htonl(0xFFFFFFFFUL);
+ rtnl_lock();
+ if (r.arp_dev[0]) {
+ err = -ENODEV;
+ if ((dev = __dev_get_by_name(r.arp_dev)) == NULL)
+ goto out;
+
+ /* Mmmm... It is wrong... ARPHRD_NETROM==0 */
+ if (!r.arp_ha.sa_family)
+ r.arp_ha.sa_family = dev->type;
+ err = -EINVAL;
+ if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
+ goto out;
+ } else if (cmd == SIOCGARP) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ switch(cmd) {
+ case SIOCDARP:
+ err = arp_req_delete(&r, dev);
+ break;
+ case SIOCSARP:
+ err = arp_req_set(&r, dev);
+ break;
+ case SIOCGARP:
+ err = arp_req_get(&r, dev);
+ if (!err && copy_to_user(arg, &r, sizeof(r)))
+ err = -EFAULT;
+ break;
+ }
+out:
+ rtnl_unlock();
+ return err;
+}
+
+static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+
+ switch (event) {
+ case NETDEV_CHANGEADDR:
+ neigh_changeaddr(&arp_tbl, dev);
+ rt_cache_flush(0);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block arp_netdev_notifier = {
+ .notifier_call = arp_netdev_event,
+};
+
+/* Note, that it is not on notifier chain.
+ It is necessary, that this routine was called after route cache will be
+ flushed.
+ */
+void arp_ifdown(struct net_device *dev)
+{
+ neigh_ifdown(&arp_tbl, dev);
+}
+
+
+/*
+ * Called once on startup.
+ */
+
+static struct packet_type arp_packet_type = {
+ .type = __constant_htons(ETH_P_ARP),
+ .func = arp_rcv,
+};
+
+static int arp_proc_init(void);
+
+void __init arp_init(void)
+{
+ neigh_table_init(&arp_tbl);
+
+ dev_add_pack(&arp_packet_type);
+ arp_proc_init();
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4,
+ NET_IPV4_NEIGH, "ipv4", NULL, NULL);
+#endif
+ register_netdevice_notifier(&arp_netdev_notifier);
+}
+
+#ifdef CONFIG_PROC_FS
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+
+/* ------------------------------------------------------------------------ */
+/*
+ * ax25 -> ASCII conversion
+ */
+static char *ax2asc2(ax25_address *a, char *buf)
+{
+ char c, *s;
+ int n;
+
+ for (n = 0, s = buf; n < 6; n++) {
+ c = (a->ax25_call[n] >> 1) & 0x7F;
+
+ if (c != ' ') *s++ = c;
+ }
+
+ *s++ = '-';
+
+ if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) {
+ *s++ = '1';
+ n -= 10;
+ }
+
+ *s++ = n + '0';
+ *s++ = '\0';
+
+ if (*buf == '\0' || *buf == '-')
+ return "*";
+
+ return buf;
+
+}
+#endif /* CONFIG_AX25 */
+
+#define HBUFFERLEN 30
+
+static void arp_format_neigh_entry(struct seq_file *seq,
+ struct neighbour *n)
+{
+ char hbuffer[HBUFFERLEN];
+ const char hexbuf[] = "0123456789ABCDEF";
+ int k, j;
+ char tbuf[16];
+ struct net_device *dev = n->dev;
+ int hatype = dev->type;
+
+ read_lock(&n->lock);
+ /* Convert hardware address to XX:XX:XX:XX ... form. */
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
+ ax2asc2((ax25_address *)n->ha, hbuffer);
+ else {
+#endif
+ for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
+ hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15];
+ hbuffer[k++] = hexbuf[n->ha[j] & 15];
+ hbuffer[k++] = ':';
+ }
+ hbuffer[--k] = 0;
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ }
+#endif
+ sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key));
+ seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
+ tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
+ read_unlock(&n->lock);
+}
+
+static void arp_format_pneigh_entry(struct seq_file *seq,
+ struct pneigh_entry *n)
+{
+ struct net_device *dev = n->dev;
+ int hatype = dev ? dev->type : 0;
+ char tbuf[16];
+
+ sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->key));
+ seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
+ tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00",
+ dev ? dev->name : "*");
+}
+
+static int arp_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "IP address HW type Flags "
+ "HW address Mask Device\n");
+ } else {
+ struct neigh_seq_state *state = seq->private;
+
+ if (state->flags & NEIGH_SEQ_IS_PNEIGH)
+ arp_format_pneigh_entry(seq, v);
+ else
+ arp_format_neigh_entry(seq, v);
+ }
+
+ return 0;
+}
+
+static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ /* Don't want to confuse "arp -a" w/ magic entries,
+ * so we tell the generic iterator to skip NUD_NOARP.
+ */
+ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP);
+}
+
+/* ------------------------------------------------------------------------ */
+
+static struct seq_operations arp_seq_ops = {
+ .start = arp_seq_start,
+ .next = neigh_seq_next,
+ .stop = neigh_seq_stop,
+ .show = arp_seq_show,
+};
+
+static int arp_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct neigh_seq_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+
+ memset(s, 0, sizeof(*s));
+ rc = seq_open(file, &arp_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+ seq->private = s;
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+static struct file_operations arp_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = arp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static int __init arp_proc_init(void)
+{
+ if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+#else /* CONFIG_PROC_FS */
+
+static int __init arp_proc_init(void)
+{
+ return 0;
+}
+
+#endif /* CONFIG_PROC_FS */
+
+EXPORT_SYMBOL(arp_broken_ops);
+EXPORT_SYMBOL(arp_find);
+EXPORT_SYMBOL(arp_rcv);
+EXPORT_SYMBOL(arp_create);
+EXPORT_SYMBOL(arp_xmit);
+EXPORT_SYMBOL(arp_send);
+EXPORT_SYMBOL(arp_tbl);
+
+#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
+EXPORT_SYMBOL(clip_tbl_hook);
+#endif
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
new file mode 100644
index 000000000000..b1db561f2542
--- /dev/null
+++ b/net/ipv4/datagram.c
@@ -0,0 +1,73 @@
+/*
+ * common UDP/RAW code
+ * Linux INET implementation
+ *
+ * Authors:
+ * Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
+ struct rtable *rt;
+ u32 saddr;
+ int oif;
+ int err;
+
+
+ if (addr_len < sizeof(*usin))
+ return -EINVAL;
+
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ sk_dst_reset(sk);
+
+ oif = sk->sk_bound_dev_if;
+ saddr = inet->saddr;
+ if (MULTICAST(usin->sin_addr.s_addr)) {
+ if (!oif)
+ oif = inet->mc_index;
+ if (!saddr)
+ saddr = inet->mc_addr;
+ }
+ err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
+ RT_CONN_FLAGS(sk), oif,
+ sk->sk_protocol,
+ inet->sport, usin->sin_port, sk);
+ if (err)
+ return err;
+ if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
+ ip_rt_put(rt);
+ return -EACCES;
+ }
+ if (!inet->saddr)
+ inet->saddr = rt->rt_src; /* Update source address */
+ if (!inet->rcv_saddr)
+ inet->rcv_saddr = rt->rt_src;
+ inet->daddr = rt->rt_dst;
+ inet->dport = usin->sin_port;
+ sk->sk_state = TCP_ESTABLISHED;
+ inet->id = jiffies;
+
+ sk_dst_set(sk, &rt->u.dst);
+ return(0);
+}
+
+EXPORT_SYMBOL(ip4_datagram_connect);
+
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
new file mode 100644
index 000000000000..eea7ef010776
--- /dev/null
+++ b/net/ipv4/devinet.c
@@ -0,0 +1,1508 @@
+/*
+ * NET3 IP device support routines.
+ *
+ * Version: $Id: devinet.c,v 1.44 2001/10/31 21:55:54 davem Exp $
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Derived from the IP parts of dev.c 1.0.19
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *
+ * Additional Authors:
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * Alexey Kuznetsov: pa_* fields are replaced with ifaddr
+ * lists.
+ * Cyrus Durgin: updated for kmod
+ * Matthias Andree: in devinet_ioctl, compare label and
+ * address (4.4BSD alias style support),
+ * fall back to comparing just the label
+ * if no match found.
+ */
+
+#include <linux/config.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <linux/kmod.h>
+
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+
+struct ipv4_devconf ipv4_devconf = {
+ .accept_redirects = 1,
+ .send_redirects = 1,
+ .secure_redirects = 1,
+ .shared_media = 1,
+};
+
+static struct ipv4_devconf ipv4_devconf_dflt = {
+ .accept_redirects = 1,
+ .send_redirects = 1,
+ .secure_redirects = 1,
+ .shared_media = 1,
+ .accept_source_route = 1,
+};
+
+static void rtmsg_ifa(int event, struct in_ifaddr *);
+
+static struct notifier_block *inetaddr_chain;
+static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+ int destroy);
+#ifdef CONFIG_SYSCTL
+static void devinet_sysctl_register(struct in_device *in_dev,
+ struct ipv4_devconf *p);
+static void devinet_sysctl_unregister(struct ipv4_devconf *p);
+#endif
+
+/* Locks all the inet devices. */
+
+static struct in_ifaddr *inet_alloc_ifa(void)
+{
+ struct in_ifaddr *ifa = kmalloc(sizeof(*ifa), GFP_KERNEL);
+
+ if (ifa) {
+ memset(ifa, 0, sizeof(*ifa));
+ INIT_RCU_HEAD(&ifa->rcu_head);
+ }
+
+ return ifa;
+}
+
+static void inet_rcu_free_ifa(struct rcu_head *head)
+{
+ struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
+ if (ifa->ifa_dev)
+ in_dev_put(ifa->ifa_dev);
+ kfree(ifa);
+}
+
+static inline void inet_free_ifa(struct in_ifaddr *ifa)
+{
+ call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+}
+
+void in_dev_finish_destroy(struct in_device *idev)
+{
+ struct net_device *dev = idev->dev;
+
+ BUG_TRAP(!idev->ifa_list);
+ BUG_TRAP(!idev->mc_list);
+#ifdef NET_REFCNT_DEBUG
+ printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n",
+ idev, dev ? dev->name : "NIL");
+#endif
+ dev_put(dev);
+ if (!idev->dead)
+ printk("Freeing alive in_device %p\n", idev);
+ else {
+ kfree(idev);
+ }
+}
+
+struct in_device *inetdev_init(struct net_device *dev)
+{
+ struct in_device *in_dev;
+
+ ASSERT_RTNL();
+
+ in_dev = kmalloc(sizeof(*in_dev), GFP_KERNEL);
+ if (!in_dev)
+ goto out;
+ memset(in_dev, 0, sizeof(*in_dev));
+ INIT_RCU_HEAD(&in_dev->rcu_head);
+ memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf));
+ in_dev->cnf.sysctl = NULL;
+ in_dev->dev = dev;
+ if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)
+ goto out_kfree;
+ /* Reference in_dev->dev */
+ dev_hold(dev);
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
+ NET_IPV4_NEIGH, "ipv4", NULL, NULL);
+#endif
+
+ /* Account for reference dev->ip_ptr */
+ in_dev_hold(in_dev);
+ rcu_assign_pointer(dev->ip_ptr, in_dev);
+
+#ifdef CONFIG_SYSCTL
+ devinet_sysctl_register(in_dev, &in_dev->cnf);
+#endif
+ ip_mc_init_dev(in_dev);
+ if (dev->flags & IFF_UP)
+ ip_mc_up(in_dev);
+out:
+ return in_dev;
+out_kfree:
+ kfree(in_dev);
+ in_dev = NULL;
+ goto out;
+}
+
+static void in_dev_rcu_put(struct rcu_head *head)
+{
+ struct in_device *idev = container_of(head, struct in_device, rcu_head);
+ in_dev_put(idev);
+}
+
+static void inetdev_destroy(struct in_device *in_dev)
+{
+ struct in_ifaddr *ifa;
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+
+ dev = in_dev->dev;
+ if (dev == &loopback_dev)
+ return;
+
+ in_dev->dead = 1;
+
+ ip_mc_destroy_dev(in_dev);
+
+ while ((ifa = in_dev->ifa_list) != NULL) {
+ inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
+ inet_free_ifa(ifa);
+ }
+
+#ifdef CONFIG_SYSCTL
+ devinet_sysctl_unregister(&in_dev->cnf);
+#endif
+
+ dev->ip_ptr = NULL;
+
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_unregister(in_dev->arp_parms);
+#endif
+ neigh_parms_release(&arp_tbl, in_dev->arp_parms);
+ arp_ifdown(dev);
+
+ call_rcu(&in_dev->rcu_head, in_dev_rcu_put);
+}
+
+int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
+{
+ rcu_read_lock();
+ for_primary_ifa(in_dev) {
+ if (inet_ifa_match(a, ifa)) {
+ if (!b || inet_ifa_match(b, ifa)) {
+ rcu_read_unlock();
+ return 1;
+ }
+ }
+ } endfor_ifa(in_dev);
+ rcu_read_unlock();
+ return 0;
+}
+
+static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+ int destroy)
+{
+ struct in_ifaddr *ifa1 = *ifap;
+
+ ASSERT_RTNL();
+
+ /* 1. Deleting primary ifaddr forces deletion all secondaries */
+
+ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
+ struct in_ifaddr *ifa;
+ struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+
+ while ((ifa = *ifap1) != NULL) {
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY) ||
+ ifa1->ifa_mask != ifa->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, ifa)) {
+ ifap1 = &ifa->ifa_next;
+ continue;
+ }
+
+ *ifap1 = ifa->ifa_next;
+
+ rtmsg_ifa(RTM_DELADDR, ifa);
+ notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
+ inet_free_ifa(ifa);
+ }
+ }
+
+ /* 2. Unlink it */
+
+ *ifap = ifa1->ifa_next;
+
+ /* 3. Announce address deletion */
+
+ /* Send message first, then call notifier.
+ At first sight, FIB update triggered by notifier
+ will refer to already deleted ifaddr, that could confuse
+ netlink listeners. It is not true: look, gated sees
+ that route deleted and if it still thinks that ifaddr
+ is valid, it will try to restore deleted routes... Grr.
+ So that, this order is correct.
+ */
+ rtmsg_ifa(RTM_DELADDR, ifa1);
+ notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
+ if (destroy) {
+ inet_free_ifa(ifa1);
+
+ if (!in_dev->ifa_list)
+ inetdev_destroy(in_dev);
+ }
+}
+
+static int inet_insert_ifa(struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct in_ifaddr *ifa1, **ifap, **last_primary;
+
+ ASSERT_RTNL();
+
+ if (!ifa->ifa_local) {
+ inet_free_ifa(ifa);
+ return 0;
+ }
+
+ ifa->ifa_flags &= ~IFA_F_SECONDARY;
+ last_primary = &in_dev->ifa_list;
+
+ for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
+ ifap = &ifa1->ifa_next) {
+ if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
+ ifa->ifa_scope <= ifa1->ifa_scope)
+ last_primary = &ifa1->ifa_next;
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa)) {
+ if (ifa1->ifa_local == ifa->ifa_local) {
+ inet_free_ifa(ifa);
+ return -EEXIST;
+ }
+ if (ifa1->ifa_scope != ifa->ifa_scope) {
+ inet_free_ifa(ifa);
+ return -EINVAL;
+ }
+ ifa->ifa_flags |= IFA_F_SECONDARY;
+ }
+ }
+
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
+ net_srandom(ifa->ifa_local);
+ ifap = last_primary;
+ }
+
+ ifa->ifa_next = *ifap;
+ *ifap = ifa;
+
+ /* Send message first, then call notifier.
+ Notifier will trigger FIB update, so that
+ listeners of netlink will know about new ifaddr */
+ rtmsg_ifa(RTM_NEWADDR, ifa);
+ notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+
+ return 0;
+}
+
+static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = __in_dev_get(dev);
+
+ ASSERT_RTNL();
+
+ if (!in_dev) {
+ in_dev = inetdev_init(dev);
+ if (!in_dev) {
+ inet_free_ifa(ifa);
+ return -ENOBUFS;
+ }
+ }
+ if (ifa->ifa_dev != in_dev) {
+ BUG_TRAP(!ifa->ifa_dev);
+ in_dev_hold(in_dev);
+ ifa->ifa_dev = in_dev;
+ }
+ if (LOOPBACK(ifa->ifa_local))
+ ifa->ifa_scope = RT_SCOPE_HOST;
+ return inet_insert_ifa(ifa);
+}
+
+struct in_device *inetdev_by_index(int ifindex)
+{
+ struct net_device *dev;
+ struct in_device *in_dev = NULL;
+ read_lock(&dev_base_lock);
+ dev = __dev_get_by_index(ifindex);
+ if (dev)
+ in_dev = in_dev_get(dev);
+ read_unlock(&dev_base_lock);
+ return in_dev;
+}
+
+/* Called only from RTNL semaphored context. No locks. */
+
+struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix,
+ u32 mask)
+{
+ ASSERT_RTNL();
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
+ return ifa;
+ } endfor_ifa(in_dev);
+ return NULL;
+}
+
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+ struct rtattr **rta = arg;
+ struct in_device *in_dev;
+ struct ifaddrmsg *ifm = NLMSG_DATA(nlh);
+ struct in_ifaddr *ifa, **ifap;
+
+ ASSERT_RTNL();
+
+ if ((in_dev = inetdev_by_index(ifm->ifa_index)) == NULL)
+ goto out;
+ __in_dev_put(in_dev);
+
+ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ ifap = &ifa->ifa_next) {
+ if ((rta[IFA_LOCAL - 1] &&
+ memcmp(RTA_DATA(rta[IFA_LOCAL - 1]),
+ &ifa->ifa_local, 4)) ||
+ (rta[IFA_LABEL - 1] &&
+ rtattr_strcmp(rta[IFA_LABEL - 1], ifa->ifa_label)) ||
+ (rta[IFA_ADDRESS - 1] &&
+ (ifm->ifa_prefixlen != ifa->ifa_prefixlen ||
+ !inet_ifa_match(*(u32*)RTA_DATA(rta[IFA_ADDRESS - 1]),
+ ifa))))
+ continue;
+ inet_del_ifa(in_dev, ifap, 1);
+ return 0;
+ }
+out:
+ return -EADDRNOTAVAIL;
+}
+
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+ struct rtattr **rta = arg;
+ struct net_device *dev;
+ struct in_device *in_dev;
+ struct ifaddrmsg *ifm = NLMSG_DATA(nlh);
+ struct in_ifaddr *ifa;
+ int rc = -EINVAL;
+
+ ASSERT_RTNL();
+
+ if (ifm->ifa_prefixlen > 32 || !rta[IFA_LOCAL - 1])
+ goto out;
+
+ rc = -ENODEV;
+ if ((dev = __dev_get_by_index(ifm->ifa_index)) == NULL)
+ goto out;
+
+ rc = -ENOBUFS;
+ if ((in_dev = __in_dev_get(dev)) == NULL) {
+ in_dev = inetdev_init(dev);
+ if (!in_dev)
+ goto out;
+ }
+
+ if ((ifa = inet_alloc_ifa()) == NULL)
+ goto out;
+
+ if (!rta[IFA_ADDRESS - 1])
+ rta[IFA_ADDRESS - 1] = rta[IFA_LOCAL - 1];
+ memcpy(&ifa->ifa_local, RTA_DATA(rta[IFA_LOCAL - 1]), 4);
+ memcpy(&ifa->ifa_address, RTA_DATA(rta[IFA_ADDRESS - 1]), 4);
+ ifa->ifa_prefixlen = ifm->ifa_prefixlen;
+ ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
+ if (rta[IFA_BROADCAST - 1])
+ memcpy(&ifa->ifa_broadcast,
+ RTA_DATA(rta[IFA_BROADCAST - 1]), 4);
+ if (rta[IFA_ANYCAST - 1])
+ memcpy(&ifa->ifa_anycast, RTA_DATA(rta[IFA_ANYCAST - 1]), 4);
+ ifa->ifa_flags = ifm->ifa_flags;
+ ifa->ifa_scope = ifm->ifa_scope;
+ in_dev_hold(in_dev);
+ ifa->ifa_dev = in_dev;
+ if (rta[IFA_LABEL - 1])
+ rtattr_strlcpy(ifa->ifa_label, rta[IFA_LABEL - 1], IFNAMSIZ);
+ else
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+
+ rc = inet_insert_ifa(ifa);
+out:
+ return rc;
+}
+
+/*
+ * Determine a default network mask, based on the IP address.
+ */
+
+static __inline__ int inet_abc_len(u32 addr)
+{
+ int rc = -1; /* Something else, probably a multicast. */
+
+ if (ZERONET(addr))
+ rc = 0;
+ else {
+ addr = ntohl(addr);
+
+ if (IN_CLASSA(addr))
+ rc = 8;
+ else if (IN_CLASSB(addr))
+ rc = 16;
+ else if (IN_CLASSC(addr))
+ rc = 24;
+ }
+
+ return rc;
+}
+
+
+int devinet_ioctl(unsigned int cmd, void __user *arg)
+{
+ struct ifreq ifr;
+ struct sockaddr_in sin_orig;
+ struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+ struct in_device *in_dev;
+ struct in_ifaddr **ifap = NULL;
+ struct in_ifaddr *ifa = NULL;
+ struct net_device *dev;
+ char *colon;
+ int ret = -EFAULT;
+ int tryaddrmatch = 0;
+
+ /*
+ * Fetch the caller's info block into kernel space
+ */
+
+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+ goto out;
+ ifr.ifr_name[IFNAMSIZ - 1] = 0;
+
+ /* save original address for comparison */
+ memcpy(&sin_orig, sin, sizeof(*sin));
+
+ colon = strchr(ifr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+
+#ifdef CONFIG_KMOD
+ dev_load(ifr.ifr_name);
+#endif
+
+ switch(cmd) {
+ case SIOCGIFADDR: /* Get interface address */
+ case SIOCGIFBRDADDR: /* Get the broadcast address */
+ case SIOCGIFDSTADDR: /* Get the destination address */
+ case SIOCGIFNETMASK: /* Get the netmask for the interface */
+ /* Note that these ioctls will not sleep,
+ so that we do not impose a lock.
+ One day we will be forced to put shlock here (I mean SMP)
+ */
+ tryaddrmatch = (sin_orig.sin_family == AF_INET);
+ memset(sin, 0, sizeof(*sin));
+ sin->sin_family = AF_INET;
+ break;
+
+ case SIOCSIFFLAGS:
+ ret = -EACCES;
+ if (!capable(CAP_NET_ADMIN))
+ goto out;
+ break;
+ case SIOCSIFADDR: /* Set interface address (and family) */
+ case SIOCSIFBRDADDR: /* Set the broadcast address */
+ case SIOCSIFDSTADDR: /* Set the destination address */
+ case SIOCSIFNETMASK: /* Set the netmask for the interface */
+ ret = -EACCES;
+ if (!capable(CAP_NET_ADMIN))
+ goto out;
+ ret = -EINVAL;
+ if (sin->sin_family != AF_INET)
+ goto out;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ rtnl_lock();
+
+ ret = -ENODEV;
+ if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL)
+ goto done;
+
+ if (colon)
+ *colon = ':';
+
+ if ((in_dev = __in_dev_get(dev)) != NULL) {
+ if (tryaddrmatch) {
+ /* Matthias Andree */
+ /* compare label and address (4.4BSD style) */
+ /* note: we only do this for a limited set of ioctls
+ and only if the original address family was AF_INET.
+ This is checked above. */
+ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ ifap = &ifa->ifa_next) {
+ if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
+ sin_orig.sin_addr.s_addr ==
+ ifa->ifa_address) {
+ break; /* found */
+ }
+ }
+ }
+ /* we didn't get a match, maybe the application is
+ 4.3BSD-style and passed in junk so we fall back to
+ comparing just the label */
+ if (!ifa) {
+ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ ifap = &ifa->ifa_next)
+ if (!strcmp(ifr.ifr_name, ifa->ifa_label))
+ break;
+ }
+ }
+
+ ret = -EADDRNOTAVAIL;
+ if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS)
+ goto done;
+
+ switch(cmd) {
+ case SIOCGIFADDR: /* Get interface address */
+ sin->sin_addr.s_addr = ifa->ifa_local;
+ goto rarok;
+
+ case SIOCGIFBRDADDR: /* Get the broadcast address */
+ sin->sin_addr.s_addr = ifa->ifa_broadcast;
+ goto rarok;
+
+ case SIOCGIFDSTADDR: /* Get the destination address */
+ sin->sin_addr.s_addr = ifa->ifa_address;
+ goto rarok;
+
+ case SIOCGIFNETMASK: /* Get the netmask for the interface */
+ sin->sin_addr.s_addr = ifa->ifa_mask;
+ goto rarok;
+
+ case SIOCSIFFLAGS:
+ if (colon) {
+ ret = -EADDRNOTAVAIL;
+ if (!ifa)
+ break;
+ ret = 0;
+ if (!(ifr.ifr_flags & IFF_UP))
+ inet_del_ifa(in_dev, ifap, 1);
+ break;
+ }
+ ret = dev_change_flags(dev, ifr.ifr_flags);
+ break;
+
+ case SIOCSIFADDR: /* Set interface address (and family) */
+ ret = -EINVAL;
+ if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+ break;
+
+ if (!ifa) {
+ ret = -ENOBUFS;
+ if ((ifa = inet_alloc_ifa()) == NULL)
+ break;
+ if (colon)
+ memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
+ else
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ } else {
+ ret = 0;
+ if (ifa->ifa_local == sin->sin_addr.s_addr)
+ break;
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_broadcast = 0;
+ ifa->ifa_anycast = 0;
+ }
+
+ ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr;
+
+ if (!(dev->flags & IFF_POINTOPOINT)) {
+ ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address);
+ ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
+ if ((dev->flags & IFF_BROADCAST) &&
+ ifa->ifa_prefixlen < 31)
+ ifa->ifa_broadcast = ifa->ifa_address |
+ ~ifa->ifa_mask;
+ } else {
+ ifa->ifa_prefixlen = 32;
+ ifa->ifa_mask = inet_make_mask(32);
+ }
+ ret = inet_set_ifa(dev, ifa);
+ break;
+
+ case SIOCSIFBRDADDR: /* Set the broadcast address */
+ ret = 0;
+ if (ifa->ifa_broadcast != sin->sin_addr.s_addr) {
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_broadcast = sin->sin_addr.s_addr;
+ inet_insert_ifa(ifa);
+ }
+ break;
+
+ case SIOCSIFDSTADDR: /* Set the destination address */
+ ret = 0;
+ if (ifa->ifa_address == sin->sin_addr.s_addr)
+ break;
+ ret = -EINVAL;
+ if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+ break;
+ ret = 0;
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_address = sin->sin_addr.s_addr;
+ inet_insert_ifa(ifa);
+ break;
+
+ case SIOCSIFNETMASK: /* Set the netmask for the interface */
+
+ /*
+ * The mask we set must be legal.
+ */
+ ret = -EINVAL;
+ if (bad_mask(sin->sin_addr.s_addr, 0))
+ break;
+ ret = 0;
+ if (ifa->ifa_mask != sin->sin_addr.s_addr) {
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_mask = sin->sin_addr.s_addr;
+ ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);
+
+ /* See if current broadcast address matches
+ * with current netmask, then recalculate
+ * the broadcast address. Otherwise it's a
+ * funny address, so don't touch it since
+ * the user seems to know what (s)he's doing...
+ */
+ if ((dev->flags & IFF_BROADCAST) &&
+ (ifa->ifa_prefixlen < 31) &&
+ (ifa->ifa_broadcast ==
+ (ifa->ifa_local|~ifa->ifa_mask))) {
+ ifa->ifa_broadcast = (ifa->ifa_local |
+ ~sin->sin_addr.s_addr);
+ }
+ inet_insert_ifa(ifa);
+ }
+ break;
+ }
+done:
+ rtnl_unlock();
+out:
+ return ret;
+rarok:
+ rtnl_unlock();
+ ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0;
+ goto out;
+}
+
+static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
+{
+ struct in_device *in_dev = __in_dev_get(dev);
+ struct in_ifaddr *ifa;
+ struct ifreq ifr;
+ int done = 0;
+
+ if (!in_dev || (ifa = in_dev->ifa_list) == NULL)
+ goto out;
+
+ for (; ifa; ifa = ifa->ifa_next) {
+ if (!buf) {
+ done += sizeof(ifr);
+ continue;
+ }
+ if (len < (int) sizeof(ifr))
+ break;
+ memset(&ifr, 0, sizeof(struct ifreq));
+ if (ifa->ifa_label)
+ strcpy(ifr.ifr_name, ifa->ifa_label);
+ else
+ strcpy(ifr.ifr_name, dev->name);
+
+ (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
+ (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
+ ifa->ifa_local;
+
+ if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) {
+ done = -EFAULT;
+ break;
+ }
+ buf += sizeof(struct ifreq);
+ len -= sizeof(struct ifreq);
+ done += sizeof(struct ifreq);
+ }
+out:
+ return done;
+}
+
+u32 inet_select_addr(const struct net_device *dev, u32 dst, int scope)
+{
+ u32 addr = 0;
+ struct in_device *in_dev;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get(dev);
+ if (!in_dev)
+ goto no_in_dev;
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope > scope)
+ continue;
+ if (!dst || inet_ifa_match(dst, ifa)) {
+ addr = ifa->ifa_local;
+ break;
+ }
+ if (!addr)
+ addr = ifa->ifa_local;
+ } endfor_ifa(in_dev);
+no_in_dev:
+ rcu_read_unlock();
+
+ if (addr)
+ goto out;
+
+ /* Not loopback addresses on loopback should be preferred
+ in this case. It is importnat that lo is the first interface
+ in dev_base list.
+ */
+ read_lock(&dev_base_lock);
+ rcu_read_lock();
+ for (dev = dev_base; dev; dev = dev->next) {
+ if ((in_dev = __in_dev_get(dev)) == NULL)
+ continue;
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope != RT_SCOPE_LINK &&
+ ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ goto out_unlock_both;
+ }
+ } endfor_ifa(in_dev);
+ }
+out_unlock_both:
+ read_unlock(&dev_base_lock);
+ rcu_read_unlock();
+out:
+ return addr;
+}
+
+static u32 confirm_addr_indev(struct in_device *in_dev, u32 dst,
+ u32 local, int scope)
+{
+ int same = 0;
+ u32 addr = 0;
+
+ for_ifa(in_dev) {
+ if (!addr &&
+ (local == ifa->ifa_local || !local) &&
+ ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ if (same)
+ break;
+ }
+ if (!same) {
+ same = (!local || inet_ifa_match(local, ifa)) &&
+ (!dst || inet_ifa_match(dst, ifa));
+ if (same && addr) {
+ if (local || !dst)
+ break;
+ /* Is the selected addr into dst subnet? */
+ if (inet_ifa_match(addr, ifa))
+ break;
+ /* No, then can we use new local src? */
+ if (ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ break;
+ }
+ /* search for large dst subnet for addr */
+ same = 0;
+ }
+ }
+ } endfor_ifa(in_dev);
+
+ return same? addr : 0;
+}
+
+/*
+ * Confirm that local IP address exists using wildcards:
+ * - dev: only on this interface, 0=any interface
+ * - dst: only in the same subnet as dst, 0=any dst
+ * - local: address, 0=autoselect the local address
+ * - scope: maximum allowed scope value for the local address
+ */
+u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scope)
+{
+ u32 addr = 0;
+ struct in_device *in_dev;
+
+ if (dev) {
+ rcu_read_lock();
+ if ((in_dev = __in_dev_get(dev)))
+ addr = confirm_addr_indev(in_dev, dst, local, scope);
+ rcu_read_unlock();
+
+ return addr;
+ }
+
+ read_lock(&dev_base_lock);
+ rcu_read_lock();
+ for (dev = dev_base; dev; dev = dev->next) {
+ if ((in_dev = __in_dev_get(dev))) {
+ addr = confirm_addr_indev(in_dev, dst, local, scope);
+ if (addr)
+ break;
+ }
+ }
+ rcu_read_unlock();
+ read_unlock(&dev_base_lock);
+
+ return addr;
+}
+
+/*
+ * Device notifier
+ */
+
+int register_inetaddr_notifier(struct notifier_block *nb)
+{
+ return notifier_chain_register(&inetaddr_chain, nb);
+}
+
+int unregister_inetaddr_notifier(struct notifier_block *nb)
+{
+ return notifier_chain_unregister(&inetaddr_chain, nb);
+}
+
+/* Rename ifa_labels for a device name change. Make some effort to preserve existing
+ * alias numbering and to create unique labels if possible.
+*/
+static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
+{
+ struct in_ifaddr *ifa;
+ int named = 0;
+
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ char old[IFNAMSIZ], *dot;
+
+ memcpy(old, ifa->ifa_label, IFNAMSIZ);
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ if (named++ == 0)
+ continue;
+ dot = strchr(ifa->ifa_label, ':');
+ if (dot == NULL) {
+ sprintf(old, ":%d", named);
+ dot = old;
+ }
+ if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) {
+ strcat(ifa->ifa_label, dot);
+ } else {
+ strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
+ }
+ }
+}
+
+/* Called only under RTNL semaphore */
+
+static int inetdev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct in_device *in_dev = __in_dev_get(dev);
+
+ ASSERT_RTNL();
+
+ if (!in_dev) {
+ if (event == NETDEV_REGISTER && dev == &loopback_dev) {
+ in_dev = inetdev_init(dev);
+ if (!in_dev)
+ panic("devinet: Failed to create loopback\n");
+ in_dev->cnf.no_xfrm = 1;
+ in_dev->cnf.no_policy = 1;
+ }
+ goto out;
+ }
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ printk(KERN_DEBUG "inetdev_event: bug\n");
+ dev->ip_ptr = NULL;
+ break;
+ case NETDEV_UP:
+ if (dev->mtu < 68)
+ break;
+ if (dev == &loopback_dev) {
+ struct in_ifaddr *ifa;
+ if ((ifa = inet_alloc_ifa()) != NULL) {
+ ifa->ifa_local =
+ ifa->ifa_address = htonl(INADDR_LOOPBACK);
+ ifa->ifa_prefixlen = 8;
+ ifa->ifa_mask = inet_make_mask(8);
+ in_dev_hold(in_dev);
+ ifa->ifa_dev = in_dev;
+ ifa->ifa_scope = RT_SCOPE_HOST;
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ inet_insert_ifa(ifa);
+ }
+ }
+ ip_mc_up(in_dev);
+ break;
+ case NETDEV_DOWN:
+ ip_mc_down(in_dev);
+ break;
+ case NETDEV_CHANGEMTU:
+ if (dev->mtu >= 68)
+ break;
+ /* MTU falled under 68, disable IP */
+ case NETDEV_UNREGISTER:
+ inetdev_destroy(in_dev);
+ break;
+ case NETDEV_CHANGENAME:
+ /* Do not notify about label change, this event is
+ * not interesting to applications using netlink.
+ */
+ inetdev_changename(dev, in_dev);
+
+#ifdef CONFIG_SYSCTL
+ devinet_sysctl_unregister(&in_dev->cnf);
+ neigh_sysctl_unregister(in_dev->arp_parms);
+ neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
+ NET_IPV4_NEIGH, "ipv4", NULL, NULL);
+ devinet_sysctl_register(in_dev, &in_dev->cnf);
+#endif
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block ip_netdev_notifier = {
+ .notifier_call =inetdev_event,
+};
+
+static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
+ u32 pid, u32 seq, int event)
+{
+ struct ifaddrmsg *ifm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb->tail;
+
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
+ if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+ ifm = NLMSG_DATA(nlh);
+ ifm->ifa_family = AF_INET;
+ ifm->ifa_prefixlen = ifa->ifa_prefixlen;
+ ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT;
+ ifm->ifa_scope = ifa->ifa_scope;
+ ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
+ if (ifa->ifa_address)
+ RTA_PUT(skb, IFA_ADDRESS, 4, &ifa->ifa_address);
+ if (ifa->ifa_local)
+ RTA_PUT(skb, IFA_LOCAL, 4, &ifa->ifa_local);
+ if (ifa->ifa_broadcast)
+ RTA_PUT(skb, IFA_BROADCAST, 4, &ifa->ifa_broadcast);
+ if (ifa->ifa_anycast)
+ RTA_PUT(skb, IFA_ANYCAST, 4, &ifa->ifa_anycast);
+ if (ifa->ifa_label[0])
+ RTA_PUT(skb, IFA_LABEL, IFNAMSIZ, &ifa->ifa_label);
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx, ip_idx;
+ struct net_device *dev;
+ struct in_device *in_dev;
+ struct in_ifaddr *ifa;
+ int s_ip_idx, s_idx = cb->args[0];
+
+ s_ip_idx = ip_idx = cb->args[1];
+ read_lock(&dev_base_lock);
+ for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
+ if (idx < s_idx)
+ continue;
+ if (idx > s_idx)
+ s_ip_idx = 0;
+ rcu_read_lock();
+ if ((in_dev = __in_dev_get(dev)) == NULL) {
+ rcu_read_unlock();
+ continue;
+ }
+
+ for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
+ ifa = ifa->ifa_next, ip_idx++) {
+ if (ip_idx < s_ip_idx)
+ continue;
+ if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWADDR) <= 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+done:
+ read_unlock(&dev_base_lock);
+ cb->args[0] = idx;
+ cb->args[1] = ip_idx;
+
+ return skb->len;
+}
+
+static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
+{
+ int size = NLMSG_SPACE(sizeof(struct ifaddrmsg) + 128);
+ struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
+
+ if (!skb)
+ netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
+ else if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) {
+ kfree_skb(skb);
+ netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
+ } else {
+ NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR;
+ netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL);
+ }
+}
+
+static struct rtnetlink_link inet_rtnetlink_table[RTM_MAX - RTM_BASE + 1] = {
+ [4] = { .doit = inet_rtm_newaddr, },
+ [5] = { .doit = inet_rtm_deladdr, },
+ [6] = { .dumpit = inet_dump_ifaddr, },
+ [8] = { .doit = inet_rtm_newroute, },
+ [9] = { .doit = inet_rtm_delroute, },
+ [10] = { .doit = inet_rtm_getroute, .dumpit = inet_dump_fib, },
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ [16] = { .doit = inet_rtm_newrule, },
+ [17] = { .doit = inet_rtm_delrule, },
+ [18] = { .dumpit = inet_dump_rules, },
+#endif
+};
+
+#ifdef CONFIG_SYSCTL
+
+void inet_forward_change(void)
+{
+ struct net_device *dev;
+ int on = ipv4_devconf.forwarding;
+
+ ipv4_devconf.accept_redirects = !on;
+ ipv4_devconf_dflt.forwarding = on;
+
+ read_lock(&dev_base_lock);
+ for (dev = dev_base; dev; dev = dev->next) {
+ struct in_device *in_dev;
+ rcu_read_lock();
+ in_dev = __in_dev_get(dev);
+ if (in_dev)
+ in_dev->cnf.forwarding = on;
+ rcu_read_unlock();
+ }
+ read_unlock(&dev_base_lock);
+
+ rt_cache_flush(0);
+}
+
+static int devinet_sysctl_forward(ctl_table *ctl, int write,
+ struct file* filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int val = *valp;
+ int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+
+ if (write && *valp != val) {
+ if (valp == &ipv4_devconf.forwarding)
+ inet_forward_change();
+ else if (valp != &ipv4_devconf_dflt.forwarding)
+ rt_cache_flush(0);
+ }
+
+ return ret;
+}
+
+int ipv4_doint_and_flush(ctl_table *ctl, int write,
+ struct file* filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int val = *valp;
+ int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+
+ if (write && *valp != val)
+ rt_cache_flush(0);
+
+ return ret;
+}
+
+int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen,
+ void **context)
+{
+ int *valp = table->data;
+ int new;
+
+ if (!newval || !newlen)
+ return 0;
+
+ if (newlen != sizeof(int))
+ return -EINVAL;
+
+ if (get_user(new, (int __user *)newval))
+ return -EFAULT;
+
+ if (new == *valp)
+ return 0;
+
+ if (oldval && oldlenp) {
+ size_t len;
+
+ if (get_user(len, oldlenp))
+ return -EFAULT;
+
+ if (len) {
+ if (len > table->maxlen)
+ len = table->maxlen;
+ if (copy_to_user(oldval, valp, len))
+ return -EFAULT;
+ if (put_user(len, oldlenp))
+ return -EFAULT;
+ }
+ }
+
+ *valp = new;
+ rt_cache_flush(0);
+ return 1;
+}
+
+
+static struct devinet_sysctl_table {
+ struct ctl_table_header *sysctl_header;
+ ctl_table devinet_vars[__NET_IPV4_CONF_MAX];
+ ctl_table devinet_dev[2];
+ ctl_table devinet_conf_dir[2];
+ ctl_table devinet_proto_dir[2];
+ ctl_table devinet_root_dir[2];
+} devinet_sysctl = {
+ .devinet_vars = {
+ {
+ .ctl_name = NET_IPV4_CONF_FORWARDING,
+ .procname = "forwarding",
+ .data = &ipv4_devconf.forwarding,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &devinet_sysctl_forward,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_MC_FORWARDING,
+ .procname = "mc_forwarding",
+ .data = &ipv4_devconf.mc_forwarding,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_ACCEPT_REDIRECTS,
+ .procname = "accept_redirects",
+ .data = &ipv4_devconf.accept_redirects,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_SECURE_REDIRECTS,
+ .procname = "secure_redirects",
+ .data = &ipv4_devconf.secure_redirects,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_SHARED_MEDIA,
+ .procname = "shared_media",
+ .data = &ipv4_devconf.shared_media,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_RP_FILTER,
+ .procname = "rp_filter",
+ .data = &ipv4_devconf.rp_filter,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_SEND_REDIRECTS,
+ .procname = "send_redirects",
+ .data = &ipv4_devconf.send_redirects,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE,
+ .procname = "accept_source_route",
+ .data = &ipv4_devconf.accept_source_route,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_PROXY_ARP,
+ .procname = "proxy_arp",
+ .data = &ipv4_devconf.proxy_arp,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_MEDIUM_ID,
+ .procname = "medium_id",
+ .data = &ipv4_devconf.medium_id,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_BOOTP_RELAY,
+ .procname = "bootp_relay",
+ .data = &ipv4_devconf.bootp_relay,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_LOG_MARTIANS,
+ .procname = "log_martians",
+ .data = &ipv4_devconf.log_martians,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_TAG,
+ .procname = "tag",
+ .data = &ipv4_devconf.tag,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_ARPFILTER,
+ .procname = "arp_filter",
+ .data = &ipv4_devconf.arp_filter,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_ARP_ANNOUNCE,
+ .procname = "arp_announce",
+ .data = &ipv4_devconf.arp_announce,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_ARP_IGNORE,
+ .procname = "arp_ignore",
+ .data = &ipv4_devconf.arp_ignore,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_NOXFRM,
+ .procname = "disable_xfrm",
+ .data = &ipv4_devconf.no_xfrm,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &ipv4_doint_and_flush,
+ .strategy = &ipv4_doint_and_flush_strategy,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_NOPOLICY,
+ .procname = "disable_policy",
+ .data = &ipv4_devconf.no_policy,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &ipv4_doint_and_flush,
+ .strategy = &ipv4_doint_and_flush_strategy,
+ },
+ {
+ .ctl_name = NET_IPV4_CONF_FORCE_IGMP_VERSION,
+ .procname = "force_igmp_version",
+ .data = &ipv4_devconf.force_igmp_version,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &ipv4_doint_and_flush,
+ .strategy = &ipv4_doint_and_flush_strategy,
+ },
+ },
+ .devinet_dev = {
+ {
+ .ctl_name = NET_PROTO_CONF_ALL,
+ .procname = "all",
+ .mode = 0555,
+ .child = devinet_sysctl.devinet_vars,
+ },
+ },
+ .devinet_conf_dir = {
+ {
+ .ctl_name = NET_IPV4_CONF,
+ .procname = "conf",
+ .mode = 0555,
+ .child = devinet_sysctl.devinet_dev,
+ },
+ },
+ .devinet_proto_dir = {
+ {
+ .ctl_name = NET_IPV4,
+ .procname = "ipv4",
+ .mode = 0555,
+ .child = devinet_sysctl.devinet_conf_dir,
+ },
+ },
+ .devinet_root_dir = {
+ {
+ .ctl_name = CTL_NET,
+ .procname = "net",
+ .mode = 0555,
+ .child = devinet_sysctl.devinet_proto_dir,
+ },
+ },
+};
+
+static void devinet_sysctl_register(struct in_device *in_dev,
+ struct ipv4_devconf *p)
+{
+ int i;
+ struct net_device *dev = in_dev ? in_dev->dev : NULL;
+ struct devinet_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
+ char *dev_name = NULL;
+
+ if (!t)
+ return;
+ memcpy(t, &devinet_sysctl, sizeof(*t));
+ for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
+ t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
+ t->devinet_vars[i].de = NULL;
+ }
+
+ if (dev) {
+ dev_name = dev->name;
+ t->devinet_dev[0].ctl_name = dev->ifindex;
+ } else {
+ dev_name = "default";
+ t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT;
+ }
+
+ /*
+ * Make a copy of dev_name, because '.procname' is regarded as const
+ * by sysctl and we wouldn't want anyone to change it under our feet
+ * (see SIOCSIFNAME).
+ */
+ dev_name = net_sysctl_strdup(dev_name);
+ if (!dev_name)
+ goto free;
+
+ t->devinet_dev[0].procname = dev_name;
+ t->devinet_dev[0].child = t->devinet_vars;
+ t->devinet_dev[0].de = NULL;
+ t->devinet_conf_dir[0].child = t->devinet_dev;
+ t->devinet_conf_dir[0].de = NULL;
+ t->devinet_proto_dir[0].child = t->devinet_conf_dir;
+ t->devinet_proto_dir[0].de = NULL;
+ t->devinet_root_dir[0].child = t->devinet_proto_dir;
+ t->devinet_root_dir[0].de = NULL;
+
+ t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0);
+ if (!t->sysctl_header)
+ goto free_procname;
+
+ p->sysctl = t;
+ return;
+
+ /* error path */
+ free_procname:
+ kfree(dev_name);
+ free:
+ kfree(t);
+ return;
+}
+
+static void devinet_sysctl_unregister(struct ipv4_devconf *p)
+{
+ if (p->sysctl) {
+ struct devinet_sysctl_table *t = p->sysctl;
+ p->sysctl = NULL;
+ unregister_sysctl_table(t->sysctl_header);
+ kfree(t->devinet_dev[0].procname);
+ kfree(t);
+ }
+}
+#endif
+
+void __init devinet_init(void)
+{
+ register_gifconf(PF_INET, inet_gifconf);
+ register_netdevice_notifier(&ip_netdev_notifier);
+ rtnetlink_links[PF_INET] = inet_rtnetlink_table;
+#ifdef CONFIG_SYSCTL
+ devinet_sysctl.sysctl_header =
+ register_sysctl_table(devinet_sysctl.devinet_root_dir, 0);
+ devinet_sysctl_register(NULL, &ipv4_devconf_dflt);
+#endif
+}
+
+EXPORT_SYMBOL(devinet_ioctl);
+EXPORT_SYMBOL(in_dev_finish_destroy);
+EXPORT_SYMBOL(inet_select_addr);
+EXPORT_SYMBOL(inetdev_by_index);
+EXPORT_SYMBOL(register_inetaddr_notifier);
+EXPORT_SYMBOL(unregister_inetaddr_notifier);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
new file mode 100644
index 000000000000..053a883247ba
--- /dev/null
+++ b/net/ipv4/esp4.c
@@ -0,0 +1,510 @@
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <asm/scatterlist.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <linux/random.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+
+/* decapsulation data for use when post-processing */
+struct esp_decap_data {
+ xfrm_address_t saddr;
+ __u16 sport;
+ __u8 proto;
+};
+
+static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+ struct iphdr *top_iph;
+ struct ip_esp_hdr *esph;
+ struct crypto_tfm *tfm;
+ struct esp_data *esp;
+ struct sk_buff *trailer;
+ int blksize;
+ int clen;
+ int alen;
+ int nfrags;
+
+ /* Strip IP+ESP header. */
+ __skb_pull(skb, skb->h.raw - skb->data);
+ /* Now skb is pure payload to encrypt */
+
+ err = -ENOMEM;
+
+ /* Round to block size */
+ clen = skb->len;
+
+ esp = x->data;
+ alen = esp->auth.icv_trunc_len;
+ tfm = esp->conf.tfm;
+ blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3;
+ clen = (clen + 2 + blksize-1)&~(blksize-1);
+ if (esp->conf.padlen)
+ clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1);
+
+ if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0)
+ goto error;
+
+ /* Fill padding... */
+ do {
+ int i;
+ for (i=0; i<clen-skb->len - 2; i++)
+ *(u8*)(trailer->tail + i) = i+1;
+ } while (0);
+ *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2;
+ pskb_put(skb, trailer, clen - skb->len);
+
+ __skb_push(skb, skb->data - skb->nh.raw);
+ top_iph = skb->nh.iph;
+ esph = (struct ip_esp_hdr *)(skb->nh.raw + top_iph->ihl*4);
+ top_iph->tot_len = htons(skb->len + alen);
+ *(u8*)(trailer->tail - 1) = top_iph->protocol;
+
+ /* this is non-NULL only with UDP Encapsulation */
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct udphdr *uh;
+ u32 *udpdata32;
+
+ uh = (struct udphdr *)esph;
+ uh->source = encap->encap_sport;
+ uh->dest = encap->encap_dport;
+ uh->len = htons(skb->len + alen - top_iph->ihl*4);
+ uh->check = 0;
+
+ switch (encap->encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ esph = (struct ip_esp_hdr *)(uh + 1);
+ break;
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ udpdata32 = (u32 *)(uh + 1);
+ udpdata32[0] = udpdata32[1] = 0;
+ esph = (struct ip_esp_hdr *)(udpdata32 + 2);
+ break;
+ }
+
+ top_iph->protocol = IPPROTO_UDP;
+ } else
+ top_iph->protocol = IPPROTO_ESP;
+
+ esph->spi = x->id.spi;
+ esph->seq_no = htonl(++x->replay.oseq);
+
+ if (esp->conf.ivlen)
+ crypto_cipher_set_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
+
+ do {
+ struct scatterlist *sg = &esp->sgbuf[0];
+
+ if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
+ sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
+ if (!sg)
+ goto error;
+ }
+ skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen);
+ crypto_cipher_encrypt(tfm, sg, sg, clen);
+ if (unlikely(sg != &esp->sgbuf[0]))
+ kfree(sg);
+ } while (0);
+
+ if (esp->conf.ivlen) {
+ memcpy(esph->enc_data, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
+ crypto_cipher_get_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
+ }
+
+ if (esp->auth.icv_full_len) {
+ esp->auth.icv(esp, skb, (u8*)esph-skb->data,
+ sizeof(struct ip_esp_hdr) + esp->conf.ivlen+clen, trailer->tail);
+ pskb_put(skb, trailer, alen);
+ }
+
+ ip_send_check(top_iph);
+
+ err = 0;
+
+error:
+ return err;
+}
+
+/*
+ * Note: detecting truncated vs. non-truncated authentication data is very
+ * expensive, so we only support truncated data, which is the recommended
+ * and common case.
+ */
+static int esp_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ struct ip_esp_hdr *esph;
+ struct esp_data *esp = x->data;
+ struct sk_buff *trailer;
+ int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
+ int alen = esp->auth.icv_trunc_len;
+ int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen;
+ int nfrags;
+ int encap_len = 0;
+
+ if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr)))
+ goto out;
+
+ if (elen <= 0 || (elen & (blksize-1)))
+ goto out;
+
+ /* If integrity check is required, do this. */
+ if (esp->auth.icv_full_len) {
+ u8 sum[esp->auth.icv_full_len];
+ u8 sum1[alen];
+
+ esp->auth.icv(esp, skb, 0, skb->len-alen, sum);
+
+ if (skb_copy_bits(skb, skb->len-alen, sum1, alen))
+ BUG();
+
+ if (unlikely(memcmp(sum, sum1, alen))) {
+ x->stats.integrity_failed++;
+ goto out;
+ }
+ }
+
+ if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0)
+ goto out;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ esph = (struct ip_esp_hdr*)skb->data;
+ iph = skb->nh.iph;
+
+ /* Get ivec. This can be wrong, check against another impls. */
+ if (esp->conf.ivlen)
+ crypto_cipher_set_iv(esp->conf.tfm, esph->enc_data, crypto_tfm_alg_ivsize(esp->conf.tfm));
+
+ {
+ u8 nexthdr[2];
+ struct scatterlist *sg = &esp->sgbuf[0];
+ u8 workbuf[60];
+ int padlen;
+
+ if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
+ sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
+ if (!sg)
+ goto out;
+ }
+ skb_to_sgvec(skb, sg, sizeof(struct ip_esp_hdr) + esp->conf.ivlen, elen);
+ crypto_cipher_decrypt(esp->conf.tfm, sg, sg, elen);
+ if (unlikely(sg != &esp->sgbuf[0]))
+ kfree(sg);
+
+ if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
+ BUG();
+
+ padlen = nexthdr[0];
+ if (padlen+2 >= elen)
+ goto out;
+
+ /* ... check padding bits here. Silly. :-) */
+
+ if (x->encap && decap && decap->decap_type) {
+ struct esp_decap_data *encap_data;
+ struct udphdr *uh = (struct udphdr *) (iph+1);
+
+ encap_data = (struct esp_decap_data *) (decap->decap_data);
+ encap_data->proto = 0;
+
+ switch (decap->decap_type) {
+ case UDP_ENCAP_ESPINUDP:
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ encap_data->proto = AF_INET;
+ encap_data->saddr.a4 = iph->saddr;
+ encap_data->sport = uh->source;
+ encap_len = (void*)esph - (void*)uh;
+ break;
+
+ default:
+ goto out;
+ }
+ }
+
+ iph->protocol = nexthdr[1];
+ pskb_trim(skb, skb->len - alen - padlen - 2);
+ memcpy(workbuf, skb->nh.raw, iph->ihl*4);
+ skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen);
+ skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
+ memcpy(skb->nh.raw, workbuf, iph->ihl*4);
+ skb->nh.iph->tot_len = htons(skb->len);
+ }
+
+ return 0;
+
+out:
+ return -EINVAL;
+}
+
+static int esp_post_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
+{
+
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap;
+ struct esp_decap_data *decap_data;
+
+ encap = x->encap;
+ decap_data = (struct esp_decap_data *)(decap->decap_data);
+
+ /* first, make sure that the decap type == the encap type */
+ if (encap->encap_type != decap->decap_type)
+ return -EINVAL;
+
+ switch (encap->encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ /*
+ * 1) if the NAT-T peer's IP or port changed then
+ * advertize the change to the keying daemon.
+ * This is an inbound SA, so just compare
+ * SRC ports.
+ */
+ if (decap_data->proto == AF_INET &&
+ (decap_data->saddr.a4 != x->props.saddr.a4 ||
+ decap_data->sport != encap->encap_sport)) {
+ xfrm_address_t ipaddr;
+
+ ipaddr.a4 = decap_data->saddr.a4;
+ km_new_mapping(x, &ipaddr, decap_data->sport);
+
+ /* XXX: perhaps add an extra
+ * policy check here, to see
+ * if we should allow or
+ * reject a packet from a
+ * different source
+ * address/port.
+ */
+ }
+
+ /*
+ * 2) ignore UDP/TCP checksums in case
+ * of NAT-T in Transport Mode, or
+ * perform other post-processing fixes
+ * as per * draft-ietf-ipsec-udp-encaps-06,
+ * section 3.1.2
+ */
+ if (!x->props.mode)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ break;
+ }
+ }
+ return 0;
+}
+
+static u32 esp4_get_max_size(struct xfrm_state *x, int mtu)
+{
+ struct esp_data *esp = x->data;
+ u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
+
+ if (x->props.mode) {
+ mtu = (mtu + 2 + blksize-1)&~(blksize-1);
+ } else {
+ /* The worst case. */
+ mtu += 2 + blksize;
+ }
+ if (esp->conf.padlen)
+ mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1);
+
+ return mtu + x->props.header_len + esp->auth.icv_trunc_len;
+}
+
+static void esp4_err(struct sk_buff *skb, u32 info)
+{
+ struct iphdr *iph = (struct iphdr*)skb->data;
+ struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2));
+ struct xfrm_state *x;
+
+ if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
+ skb->h.icmph->code != ICMP_FRAG_NEEDED)
+ return;
+
+ x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
+ if (!x)
+ return;
+ NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
+ ntohl(esph->spi), ntohl(iph->daddr)));
+ xfrm_state_put(x);
+}
+
+static void esp_destroy(struct xfrm_state *x)
+{
+ struct esp_data *esp = x->data;
+
+ if (!esp)
+ return;
+
+ if (esp->conf.tfm) {
+ crypto_free_tfm(esp->conf.tfm);
+ esp->conf.tfm = NULL;
+ }
+ if (esp->conf.ivec) {
+ kfree(esp->conf.ivec);
+ esp->conf.ivec = NULL;
+ }
+ if (esp->auth.tfm) {
+ crypto_free_tfm(esp->auth.tfm);
+ esp->auth.tfm = NULL;
+ }
+ if (esp->auth.work_icv) {
+ kfree(esp->auth.work_icv);
+ esp->auth.work_icv = NULL;
+ }
+ kfree(esp);
+}
+
+static int esp_init_state(struct xfrm_state *x, void *args)
+{
+ struct esp_data *esp = NULL;
+
+ /* null auth and encryption can have zero length keys */
+ if (x->aalg) {
+ if (x->aalg->alg_key_len > 512)
+ goto error;
+ }
+ if (x->ealg == NULL)
+ goto error;
+
+ esp = kmalloc(sizeof(*esp), GFP_KERNEL);
+ if (esp == NULL)
+ return -ENOMEM;
+
+ memset(esp, 0, sizeof(*esp));
+
+ if (x->aalg) {
+ struct xfrm_algo_desc *aalg_desc;
+
+ esp->auth.key = x->aalg->alg_key;
+ esp->auth.key_len = (x->aalg->alg_key_len+7)/8;
+ esp->auth.tfm = crypto_alloc_tfm(x->aalg->alg_name, 0);
+ if (esp->auth.tfm == NULL)
+ goto error;
+ esp->auth.icv = esp_hmac_digest;
+
+ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+ BUG_ON(!aalg_desc);
+
+ if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+ crypto_tfm_alg_digestsize(esp->auth.tfm)) {
+ NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n",
+ x->aalg->alg_name,
+ crypto_tfm_alg_digestsize(esp->auth.tfm),
+ aalg_desc->uinfo.auth.icv_fullbits/8));
+ goto error;
+ }
+
+ esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
+ esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
+
+ esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL);
+ if (!esp->auth.work_icv)
+ goto error;
+ }
+ esp->conf.key = x->ealg->alg_key;
+ esp->conf.key_len = (x->ealg->alg_key_len+7)/8;
+ if (x->props.ealgo == SADB_EALG_NULL)
+ esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_ECB);
+ else
+ esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_CBC);
+ if (esp->conf.tfm == NULL)
+ goto error;
+ esp->conf.ivlen = crypto_tfm_alg_ivsize(esp->conf.tfm);
+ esp->conf.padlen = 0;
+ if (esp->conf.ivlen) {
+ esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL);
+ if (unlikely(esp->conf.ivec == NULL))
+ goto error;
+ get_random_bytes(esp->conf.ivec, esp->conf.ivlen);
+ }
+ if (crypto_cipher_setkey(esp->conf.tfm, esp->conf.key, esp->conf.key_len))
+ goto error;
+ x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
+ if (x->props.mode)
+ x->props.header_len += sizeof(struct iphdr);
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap = x->encap;
+
+ switch (encap->encap_type) {
+ default:
+ goto error;
+ case UDP_ENCAP_ESPINUDP:
+ x->props.header_len += sizeof(struct udphdr);
+ break;
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
+ break;
+ }
+ }
+ x->data = esp;
+ x->props.trailer_len = esp4_get_max_size(x, 0) - x->props.header_len;
+ return 0;
+
+error:
+ x->data = esp;
+ esp_destroy(x);
+ x->data = NULL;
+ return -EINVAL;
+}
+
+static struct xfrm_type esp_type =
+{
+ .description = "ESP4",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_ESP,
+ .init_state = esp_init_state,
+ .destructor = esp_destroy,
+ .get_max_size = esp4_get_max_size,
+ .input = esp_input,
+ .post_input = esp_post_input,
+ .output = esp_output
+};
+
+static struct net_protocol esp4_protocol = {
+ .handler = xfrm4_rcv,
+ .err_handler = esp4_err,
+ .no_policy = 1,
+};
+
+static int __init esp4_init(void)
+{
+ struct xfrm_decap_state decap;
+
+ if (sizeof(struct esp_decap_data) <
+ sizeof(decap.decap_data)) {
+ extern void decap_data_too_small(void);
+
+ decap_data_too_small();
+ }
+
+ if (xfrm_register_type(&esp_type, AF_INET) < 0) {
+ printk(KERN_INFO "ip esp init: can't add xfrm type\n");
+ return -EAGAIN;
+ }
+ if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
+ printk(KERN_INFO "ip esp init: can't add protocol\n");
+ xfrm_unregister_type(&esp_type, AF_INET);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit esp4_fini(void)
+{
+ if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
+ printk(KERN_INFO "ip esp close: can't remove protocol\n");
+ if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
+ printk(KERN_INFO "ip esp close: can't remove xfrm type\n");
+}
+
+module_init(esp4_init);
+module_exit(esp4_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
new file mode 100644
index 000000000000..563e7d612706
--- /dev/null
+++ b/net/ipv4/fib_frontend.c
@@ -0,0 +1,611 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: FIB frontend.
+ *
+ * Version: $Id: fib_frontend.c,v 1.26 2001/10/31 21:55:54 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/arp.h>
+#include <net/ip_fib.h>
+
+#define FFprint(a...) printk(KERN_DEBUG a)
+
+#ifndef CONFIG_IP_MULTIPLE_TABLES
+
+#define RT_TABLE_MIN RT_TABLE_MAIN
+
+struct fib_table *ip_fib_local_table;
+struct fib_table *ip_fib_main_table;
+
+#else
+
+#define RT_TABLE_MIN 1
+
+struct fib_table *fib_tables[RT_TABLE_MAX+1];
+
+struct fib_table *__fib_new_table(int id)
+{
+ struct fib_table *tb;
+
+ tb = fib_hash_init(id);
+ if (!tb)
+ return NULL;
+ fib_tables[id] = tb;
+ return tb;
+}
+
+
+#endif /* CONFIG_IP_MULTIPLE_TABLES */
+
+
+static void fib_flush(void)
+{
+ int flushed = 0;
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ struct fib_table *tb;
+ int id;
+
+ for (id = RT_TABLE_MAX; id>0; id--) {
+ if ((tb = fib_get_table(id))==NULL)
+ continue;
+ flushed += tb->tb_flush(tb);
+ }
+#else /* CONFIG_IP_MULTIPLE_TABLES */
+ flushed += ip_fib_main_table->tb_flush(ip_fib_main_table);
+ flushed += ip_fib_local_table->tb_flush(ip_fib_local_table);
+#endif /* CONFIG_IP_MULTIPLE_TABLES */
+
+ if (flushed)
+ rt_cache_flush(-1);
+}
+
+/*
+ * Find the first device with a given source address.
+ */
+
+struct net_device * ip_dev_find(u32 addr)
+{
+ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
+ struct fib_result res;
+ struct net_device *dev = NULL;
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ res.r = NULL;
+#endif
+
+ if (!ip_fib_local_table ||
+ ip_fib_local_table->tb_lookup(ip_fib_local_table, &fl, &res))
+ return NULL;
+ if (res.type != RTN_LOCAL)
+ goto out;
+ dev = FIB_RES_DEV(res);
+
+ if (dev)
+ dev_hold(dev);
+out:
+ fib_res_put(&res);
+ return dev;
+}
+
+unsigned inet_addr_type(u32 addr)
+{
+ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
+ struct fib_result res;
+ unsigned ret = RTN_BROADCAST;
+
+ if (ZERONET(addr) || BADCLASS(addr))
+ return RTN_BROADCAST;
+ if (MULTICAST(addr))
+ return RTN_MULTICAST;
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ res.r = NULL;
+#endif
+
+ if (ip_fib_local_table) {
+ ret = RTN_UNICAST;
+ if (!ip_fib_local_table->tb_lookup(ip_fib_local_table,
+ &fl, &res)) {
+ ret = res.type;
+ fib_res_put(&res);
+ }
+ }
+ return ret;
+}
+
+/* Given (packet source, input interface) and optional (dst, oif, tos):
+ - (main) check, that source is valid i.e. not broadcast or our local
+ address.
+ - figure out what "logical" interface this packet arrived
+ and calculate "specific destination" address.
+ - check, that packet arrived from expected physical interface.
+ */
+
+int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
+ struct net_device *dev, u32 *spec_dst, u32 *itag)
+{
+ struct in_device *in_dev;
+ struct flowi fl = { .nl_u = { .ip4_u =
+ { .daddr = src,
+ .saddr = dst,
+ .tos = tos } },
+ .iif = oif };
+ struct fib_result res;
+ int no_addr, rpf;
+ int ret;
+
+ no_addr = rpf = 0;
+ rcu_read_lock();
+ in_dev = __in_dev_get(dev);
+ if (in_dev) {
+ no_addr = in_dev->ifa_list == NULL;
+ rpf = IN_DEV_RPFILTER(in_dev);
+ }
+ rcu_read_unlock();
+
+ if (in_dev == NULL)
+ goto e_inval;
+
+ if (fib_lookup(&fl, &res))
+ goto last_resort;
+ if (res.type != RTN_UNICAST)
+ goto e_inval_res;
+ *spec_dst = FIB_RES_PREFSRC(res);
+ fib_combine_itag(itag, &res);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
+#else
+ if (FIB_RES_DEV(res) == dev)
+#endif
+ {
+ ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+ fib_res_put(&res);
+ return ret;
+ }
+ fib_res_put(&res);
+ if (no_addr)
+ goto last_resort;
+ if (rpf)
+ goto e_inval;
+ fl.oif = dev->ifindex;
+
+ ret = 0;
+ if (fib_lookup(&fl, &res) == 0) {
+ if (res.type == RTN_UNICAST) {
+ *spec_dst = FIB_RES_PREFSRC(res);
+ ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+ }
+ fib_res_put(&res);
+ }
+ return ret;
+
+last_resort:
+ if (rpf)
+ goto e_inval;
+ *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+ *itag = 0;
+ return 0;
+
+e_inval_res:
+ fib_res_put(&res);
+e_inval:
+ return -EINVAL;
+}
+
+#ifndef CONFIG_IP_NOSIOCRT
+
+/*
+ * Handle IP routing ioctl calls. These are used to manipulate the routing tables
+ */
+
+int ip_rt_ioctl(unsigned int cmd, void __user *arg)
+{
+ int err;
+ struct kern_rta rta;
+ struct rtentry r;
+ struct {
+ struct nlmsghdr nlh;
+ struct rtmsg rtm;
+ } req;
+
+ switch (cmd) {
+ case SIOCADDRT: /* Add a route */
+ case SIOCDELRT: /* Delete a route */
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&r, arg, sizeof(struct rtentry)))
+ return -EFAULT;
+ rtnl_lock();
+ err = fib_convert_rtentry(cmd, &req.nlh, &req.rtm, &rta, &r);
+ if (err == 0) {
+ if (cmd == SIOCDELRT) {
+ struct fib_table *tb = fib_get_table(req.rtm.rtm_table);
+ err = -ESRCH;
+ if (tb)
+ err = tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
+ } else {
+ struct fib_table *tb = fib_new_table(req.rtm.rtm_table);
+ err = -ENOBUFS;
+ if (tb)
+ err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
+ }
+ if (rta.rta_mx)
+ kfree(rta.rta_mx);
+ }
+ rtnl_unlock();
+ return err;
+ }
+ return -EINVAL;
+}
+
+#else
+
+int ip_rt_ioctl(unsigned int cmd, void *arg)
+{
+ return -EINVAL;
+}
+
+#endif
+
+static int inet_check_attr(struct rtmsg *r, struct rtattr **rta)
+{
+ int i;
+
+ for (i=1; i<=RTA_MAX; i++) {
+ struct rtattr *attr = rta[i-1];
+ if (attr) {
+ if (RTA_PAYLOAD(attr) < 4)
+ return -EINVAL;
+ if (i != RTA_MULTIPATH && i != RTA_METRICS)
+ rta[i-1] = (struct rtattr*)RTA_DATA(attr);
+ }
+ }
+ return 0;
+}
+
+int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct fib_table * tb;
+ struct rtattr **rta = arg;
+ struct rtmsg *r = NLMSG_DATA(nlh);
+
+ if (inet_check_attr(r, rta))
+ return -EINVAL;
+
+ tb = fib_get_table(r->rtm_table);
+ if (tb)
+ return tb->tb_delete(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
+ return -ESRCH;
+}
+
+int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct fib_table * tb;
+ struct rtattr **rta = arg;
+ struct rtmsg *r = NLMSG_DATA(nlh);
+
+ if (inet_check_attr(r, rta))
+ return -EINVAL;
+
+ tb = fib_new_table(r->rtm_table);
+ if (tb)
+ return tb->tb_insert(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
+ return -ENOBUFS;
+}
+
+int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int t;
+ int s_t;
+ struct fib_table *tb;
+
+ if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
+ ((struct rtmsg*)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)
+ return ip_rt_dump(skb, cb);
+
+ s_t = cb->args[0];
+ if (s_t == 0)
+ s_t = cb->args[0] = RT_TABLE_MIN;
+
+ for (t=s_t; t<=RT_TABLE_MAX; t++) {
+ if (t < s_t) continue;
+ if (t > s_t)
+ memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
+ if ((tb = fib_get_table(t))==NULL)
+ continue;
+ if (tb->tb_dump(tb, skb, cb) < 0)
+ break;
+ }
+
+ cb->args[0] = t;
+
+ return skb->len;
+}
+
+/* Prepare and feed intra-kernel routing request.
+ Really, it should be netlink message, but :-( netlink
+ can be not configured, so that we feed it directly
+ to fib engine. It is legal, because all events occur
+ only when netlink is already locked.
+ */
+
+static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr *ifa)
+{
+ struct fib_table * tb;
+ struct {
+ struct nlmsghdr nlh;
+ struct rtmsg rtm;
+ } req;
+ struct kern_rta rta;
+
+ memset(&req.rtm, 0, sizeof(req.rtm));
+ memset(&rta, 0, sizeof(rta));
+
+ if (type == RTN_UNICAST)
+ tb = fib_new_table(RT_TABLE_MAIN);
+ else
+ tb = fib_new_table(RT_TABLE_LOCAL);
+
+ if (tb == NULL)
+ return;
+
+ req.nlh.nlmsg_len = sizeof(req);
+ req.nlh.nlmsg_type = cmd;
+ req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND;
+ req.nlh.nlmsg_pid = 0;
+ req.nlh.nlmsg_seq = 0;
+
+ req.rtm.rtm_dst_len = dst_len;
+ req.rtm.rtm_table = tb->tb_id;
+ req.rtm.rtm_protocol = RTPROT_KERNEL;
+ req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST);
+ req.rtm.rtm_type = type;
+
+ rta.rta_dst = &dst;
+ rta.rta_prefsrc = &ifa->ifa_local;
+ rta.rta_oif = &ifa->ifa_dev->dev->ifindex;
+
+ if (cmd == RTM_NEWROUTE)
+ tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
+ else
+ tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
+}
+
+static void fib_add_ifaddr(struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct net_device *dev = in_dev->dev;
+ struct in_ifaddr *prim = ifa;
+ u32 mask = ifa->ifa_mask;
+ u32 addr = ifa->ifa_local;
+ u32 prefix = ifa->ifa_address&mask;
+
+ if (ifa->ifa_flags&IFA_F_SECONDARY) {
+ prim = inet_ifa_byprefix(in_dev, prefix, mask);
+ if (prim == NULL) {
+ printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n");
+ return;
+ }
+ }
+
+ fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
+
+ if (!(dev->flags&IFF_UP))
+ return;
+
+ /* Add broadcast address, if it is explicitly assigned. */
+ if (ifa->ifa_broadcast && ifa->ifa_broadcast != 0xFFFFFFFF)
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+
+ if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
+ (prefix != addr || ifa->ifa_prefixlen < 32)) {
+ fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
+ RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
+
+ /* Add network specific broadcasts, when it takes a sense */
+ if (ifa->ifa_prefixlen < 31) {
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
+ }
+ }
+}
+
+static void fib_del_ifaddr(struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct net_device *dev = in_dev->dev;
+ struct in_ifaddr *ifa1;
+ struct in_ifaddr *prim = ifa;
+ u32 brd = ifa->ifa_address|~ifa->ifa_mask;
+ u32 any = ifa->ifa_address&ifa->ifa_mask;
+#define LOCAL_OK 1
+#define BRD_OK 2
+#define BRD0_OK 4
+#define BRD1_OK 8
+ unsigned ok = 0;
+
+ if (!(ifa->ifa_flags&IFA_F_SECONDARY))
+ fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
+ RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
+ else {
+ prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
+ if (prim == NULL) {
+ printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n");
+ return;
+ }
+ }
+
+ /* Deletion is more complicated than add.
+ We should take care of not to delete too much :-)
+
+ Scan address list to be sure that addresses are really gone.
+ */
+
+ for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+ if (ifa->ifa_local == ifa1->ifa_local)
+ ok |= LOCAL_OK;
+ if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
+ ok |= BRD_OK;
+ if (brd == ifa1->ifa_broadcast)
+ ok |= BRD1_OK;
+ if (any == ifa1->ifa_broadcast)
+ ok |= BRD0_OK;
+ }
+
+ if (!(ok&BRD_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+ if (!(ok&BRD1_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+ if (!(ok&BRD0_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+ if (!(ok&LOCAL_OK)) {
+ fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
+
+ /* Check, that this local address finally disappeared. */
+ if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) {
+ /* And the last, but not the least thing.
+ We must flush stray FIB entries.
+
+ First of all, we scan fib_info list searching
+ for stray nexthop entries, then ignite fib_flush.
+ */
+ if (fib_sync_down(ifa->ifa_local, NULL, 0))
+ fib_flush();
+ }
+ }
+#undef LOCAL_OK
+#undef BRD_OK
+#undef BRD0_OK
+#undef BRD1_OK
+}
+
+static void fib_disable_ip(struct net_device *dev, int force)
+{
+ if (fib_sync_down(0, dev, force))
+ fib_flush();
+ rt_cache_flush(0);
+ arp_ifdown(dev);
+}
+
+static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr*)ptr;
+
+ switch (event) {
+ case NETDEV_UP:
+ fib_add_ifaddr(ifa);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ fib_sync_up(ifa->ifa_dev->dev);
+#endif
+ rt_cache_flush(-1);
+ break;
+ case NETDEV_DOWN:
+ fib_del_ifaddr(ifa);
+ if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) {
+ /* Last address was deleted from this interface.
+ Disable IP.
+ */
+ fib_disable_ip(ifa->ifa_dev->dev, 1);
+ } else {
+ rt_cache_flush(-1);
+ }
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct in_device *in_dev = __in_dev_get(dev);
+
+ if (event == NETDEV_UNREGISTER) {
+ fib_disable_ip(dev, 2);
+ return NOTIFY_DONE;
+ }
+
+ if (!in_dev)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UP:
+ for_ifa(in_dev) {
+ fib_add_ifaddr(ifa);
+ } endfor_ifa(in_dev);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ fib_sync_up(dev);
+#endif
+ rt_cache_flush(-1);
+ break;
+ case NETDEV_DOWN:
+ fib_disable_ip(dev, 0);
+ break;
+ case NETDEV_CHANGEMTU:
+ case NETDEV_CHANGE:
+ rt_cache_flush(0);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block fib_inetaddr_notifier = {
+ .notifier_call =fib_inetaddr_event,
+};
+
+static struct notifier_block fib_netdev_notifier = {
+ .notifier_call =fib_netdev_event,
+};
+
+void __init ip_fib_init(void)
+{
+#ifndef CONFIG_IP_MULTIPLE_TABLES
+ ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL);
+ ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN);
+#else
+ fib_rules_init();
+#endif
+
+ register_netdevice_notifier(&fib_netdev_notifier);
+ register_inetaddr_notifier(&fib_inetaddr_notifier);
+}
+
+EXPORT_SYMBOL(inet_addr_type);
+EXPORT_SYMBOL(ip_dev_find);
+EXPORT_SYMBOL(ip_rt_ioctl);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
new file mode 100644
index 000000000000..6506dcc01b46
--- /dev/null
+++ b/net/ipv4/fib_hash.c
@@ -0,0 +1,1086 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 FIB: lookup engine and maintenance routines.
+ *
+ * Version: $Id: fib_hash.c,v 1.13 2001/10/31 21:55:54 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+
+#include "fib_lookup.h"
+
+static kmem_cache_t *fn_hash_kmem;
+static kmem_cache_t *fn_alias_kmem;
+
+struct fib_node {
+ struct hlist_node fn_hash;
+ struct list_head fn_alias;
+ u32 fn_key;
+};
+
+struct fn_zone {
+ struct fn_zone *fz_next; /* Next not empty zone */
+ struct hlist_head *fz_hash; /* Hash table pointer */
+ int fz_nent; /* Number of entries */
+
+ int fz_divisor; /* Hash divisor */
+ u32 fz_hashmask; /* (fz_divisor - 1) */
+#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
+
+ int fz_order; /* Zone order */
+ u32 fz_mask;
+#define FZ_MASK(fz) ((fz)->fz_mask)
+};
+
+/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask
+ * can be cheaper than memory lookup, so that FZ_* macros are used.
+ */
+
+struct fn_hash {
+ struct fn_zone *fn_zones[33];
+ struct fn_zone *fn_zone_list;
+};
+
+static inline u32 fn_hash(u32 key, struct fn_zone *fz)
+{
+ u32 h = ntohl(key)>>(32 - fz->fz_order);
+ h ^= (h>>20);
+ h ^= (h>>10);
+ h ^= (h>>5);
+ h &= FZ_HASHMASK(fz);
+ return h;
+}
+
+static inline u32 fz_key(u32 dst, struct fn_zone *fz)
+{
+ return dst & FZ_MASK(fz);
+}
+
+static DEFINE_RWLOCK(fib_hash_lock);
+static unsigned int fib_hash_genid;
+
+#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
+
+static struct hlist_head *fz_hash_alloc(int divisor)
+{
+ unsigned long size = divisor * sizeof(struct hlist_head);
+
+ if (size <= PAGE_SIZE) {
+ return kmalloc(size, GFP_KERNEL);
+ } else {
+ return (struct hlist_head *)
+ __get_free_pages(GFP_KERNEL, get_order(size));
+ }
+}
+
+/* The fib hash lock must be held when this is called. */
+static inline void fn_rebuild_zone(struct fn_zone *fz,
+ struct hlist_head *old_ht,
+ int old_divisor)
+{
+ int i;
+
+ for (i = 0; i < old_divisor; i++) {
+ struct hlist_node *node, *n;
+ struct fib_node *f;
+
+ hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
+ struct hlist_head *new_head;
+
+ hlist_del(&f->fn_hash);
+
+ new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
+ hlist_add_head(&f->fn_hash, new_head);
+ }
+ }
+}
+
+static void fz_hash_free(struct hlist_head *hash, int divisor)
+{
+ unsigned long size = divisor * sizeof(struct hlist_head);
+
+ if (size <= PAGE_SIZE)
+ kfree(hash);
+ else
+ free_pages((unsigned long)hash, get_order(size));
+}
+
+static void fn_rehash_zone(struct fn_zone *fz)
+{
+ struct hlist_head *ht, *old_ht;
+ int old_divisor, new_divisor;
+ u32 new_hashmask;
+
+ old_divisor = fz->fz_divisor;
+
+ switch (old_divisor) {
+ case 16:
+ new_divisor = 256;
+ break;
+ case 256:
+ new_divisor = 1024;
+ break;
+ default:
+ if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
+ printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
+ return;
+ }
+ new_divisor = (old_divisor << 1);
+ break;
+ }
+
+ new_hashmask = (new_divisor - 1);
+
+#if RT_CACHE_DEBUG >= 2
+ printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor);
+#endif
+
+ ht = fz_hash_alloc(new_divisor);
+
+ if (ht) {
+ memset(ht, 0, new_divisor * sizeof(struct hlist_head));
+
+ write_lock_bh(&fib_hash_lock);
+ old_ht = fz->fz_hash;
+ fz->fz_hash = ht;
+ fz->fz_hashmask = new_hashmask;
+ fz->fz_divisor = new_divisor;
+ fn_rebuild_zone(fz, old_ht, old_divisor);
+ fib_hash_genid++;
+ write_unlock_bh(&fib_hash_lock);
+
+ fz_hash_free(old_ht, old_divisor);
+ }
+}
+
+static inline void fn_free_node(struct fib_node * f)
+{
+ kmem_cache_free(fn_hash_kmem, f);
+}
+
+static inline void fn_free_alias(struct fib_alias *fa)
+{
+ fib_release_info(fa->fa_info);
+ kmem_cache_free(fn_alias_kmem, fa);
+}
+
+static struct fn_zone *
+fn_new_zone(struct fn_hash *table, int z)
+{
+ int i;
+ struct fn_zone *fz = kmalloc(sizeof(struct fn_zone), GFP_KERNEL);
+ if (!fz)
+ return NULL;
+
+ memset(fz, 0, sizeof(struct fn_zone));
+ if (z) {
+ fz->fz_divisor = 16;
+ } else {
+ fz->fz_divisor = 1;
+ }
+ fz->fz_hashmask = (fz->fz_divisor - 1);
+ fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
+ if (!fz->fz_hash) {
+ kfree(fz);
+ return NULL;
+ }
+ memset(fz->fz_hash, 0, fz->fz_divisor * sizeof(struct hlist_head *));
+ fz->fz_order = z;
+ fz->fz_mask = inet_make_mask(z);
+
+ /* Find the first not empty zone with more specific mask */
+ for (i=z+1; i<=32; i++)
+ if (table->fn_zones[i])
+ break;
+ write_lock_bh(&fib_hash_lock);
+ if (i>32) {
+ /* No more specific masks, we are the first. */
+ fz->fz_next = table->fn_zone_list;
+ table->fn_zone_list = fz;
+ } else {
+ fz->fz_next = table->fn_zones[i]->fz_next;
+ table->fn_zones[i]->fz_next = fz;
+ }
+ table->fn_zones[z] = fz;
+ fib_hash_genid++;
+ write_unlock_bh(&fib_hash_lock);
+ return fz;
+}
+
+static int
+fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+{
+ int err;
+ struct fn_zone *fz;
+ struct fn_hash *t = (struct fn_hash*)tb->tb_data;
+
+ read_lock(&fib_hash_lock);
+ for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct fib_node *f;
+ u32 k = fz_key(flp->fl4_dst, fz);
+
+ head = &fz->fz_hash[fn_hash(k, fz)];
+ hlist_for_each_entry(f, node, head, fn_hash) {
+ if (f->fn_key != k)
+ continue;
+
+ err = fib_semantic_match(&f->fn_alias,
+ flp, res,
+ f->fn_key, fz->fz_mask,
+ fz->fz_order);
+ if (err <= 0)
+ goto out;
+ }
+ }
+ err = 1;
+out:
+ read_unlock(&fib_hash_lock);
+ return err;
+}
+
+static int fn_hash_last_dflt=-1;
+
+static void
+fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+{
+ int order, last_idx;
+ struct hlist_node *node;
+ struct fib_node *f;
+ struct fib_info *fi = NULL;
+ struct fib_info *last_resort;
+ struct fn_hash *t = (struct fn_hash*)tb->tb_data;
+ struct fn_zone *fz = t->fn_zones[0];
+
+ if (fz == NULL)
+ return;
+
+ last_idx = -1;
+ last_resort = NULL;
+ order = -1;
+
+ read_lock(&fib_hash_lock);
+ hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
+ struct fib_alias *fa;
+
+ list_for_each_entry(fa, &f->fn_alias, fa_list) {
+ struct fib_info *next_fi = fa->fa_info;
+
+ if (fa->fa_scope != res->scope ||
+ fa->fa_type != RTN_UNICAST)
+ continue;
+
+ if (next_fi->fib_priority > res->fi->fib_priority)
+ break;
+ if (!next_fi->fib_nh[0].nh_gw ||
+ next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+ continue;
+ fa->fa_state |= FA_S_ACCESSED;
+
+ if (fi == NULL) {
+ if (next_fi != res->fi)
+ break;
+ } else if (!fib_detect_death(fi, order, &last_resort,
+ &last_idx, &fn_hash_last_dflt)) {
+ if (res->fi)
+ fib_info_put(res->fi);
+ res->fi = fi;
+ atomic_inc(&fi->fib_clntref);
+ fn_hash_last_dflt = order;
+ goto out;
+ }
+ fi = next_fi;
+ order++;
+ }
+ }
+
+ if (order <= 0 || fi == NULL) {
+ fn_hash_last_dflt = -1;
+ goto out;
+ }
+
+ if (!fib_detect_death(fi, order, &last_resort, &last_idx, &fn_hash_last_dflt)) {
+ if (res->fi)
+ fib_info_put(res->fi);
+ res->fi = fi;
+ atomic_inc(&fi->fib_clntref);
+ fn_hash_last_dflt = order;
+ goto out;
+ }
+
+ if (last_idx >= 0) {
+ if (res->fi)
+ fib_info_put(res->fi);
+ res->fi = last_resort;
+ if (last_resort)
+ atomic_inc(&last_resort->fib_clntref);
+ }
+ fn_hash_last_dflt = last_idx;
+out:
+ read_unlock(&fib_hash_lock);
+}
+
+/* Insert node F to FZ. */
+static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
+{
+ struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
+
+ hlist_add_head(&f->fn_hash, head);
+}
+
+/* Return the node in FZ matching KEY. */
+static struct fib_node *fib_find_node(struct fn_zone *fz, u32 key)
+{
+ struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)];
+ struct hlist_node *node;
+ struct fib_node *f;
+
+ hlist_for_each_entry(f, node, head, fn_hash) {
+ if (f->fn_key == key)
+ return f;
+ }
+
+ return NULL;
+}
+
+static int
+fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
+ struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+ struct fn_hash *table = (struct fn_hash *) tb->tb_data;
+ struct fib_node *new_f, *f;
+ struct fib_alias *fa, *new_fa;
+ struct fn_zone *fz;
+ struct fib_info *fi;
+ int z = r->rtm_dst_len;
+ int type = r->rtm_type;
+ u8 tos = r->rtm_tos;
+ u32 key;
+ int err;
+
+ if (z > 32)
+ return -EINVAL;
+ fz = table->fn_zones[z];
+ if (!fz && !(fz = fn_new_zone(table, z)))
+ return -ENOBUFS;
+
+ key = 0;
+ if (rta->rta_dst) {
+ u32 dst;
+ memcpy(&dst, rta->rta_dst, 4);
+ if (dst & ~FZ_MASK(fz))
+ return -EINVAL;
+ key = fz_key(dst, fz);
+ }
+
+ if ((fi = fib_create_info(r, rta, n, &err)) == NULL)
+ return err;
+
+ if (fz->fz_nent > (fz->fz_divisor<<1) &&
+ fz->fz_divisor < FZ_MAX_DIVISOR &&
+ (z==32 || (1<<z) > fz->fz_divisor))
+ fn_rehash_zone(fz);
+
+ f = fib_find_node(fz, key);
+
+ if (!f)
+ fa = NULL;
+ else
+ fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
+
+ /* Now fa, if non-NULL, points to the first fib alias
+ * with the same keys [prefix,tos,priority], if such key already
+ * exists or to the node before which we will insert new one.
+ *
+ * If fa is NULL, we will need to allocate a new one and
+ * insert to the head of f.
+ *
+ * If f is NULL, no fib node matched the destination key
+ * and we need to allocate a new one of those as well.
+ */
+
+ if (fa && fa->fa_tos == tos &&
+ fa->fa_info->fib_priority == fi->fib_priority) {
+ struct fib_alias *fa_orig;
+
+ err = -EEXIST;
+ if (n->nlmsg_flags & NLM_F_EXCL)
+ goto out;
+
+ if (n->nlmsg_flags & NLM_F_REPLACE) {
+ struct fib_info *fi_drop;
+ u8 state;
+
+ write_lock_bh(&fib_hash_lock);
+ fi_drop = fa->fa_info;
+ fa->fa_info = fi;
+ fa->fa_type = type;
+ fa->fa_scope = r->rtm_scope;
+ state = fa->fa_state;
+ fa->fa_state &= ~FA_S_ACCESSED;
+ fib_hash_genid++;
+ write_unlock_bh(&fib_hash_lock);
+
+ fib_release_info(fi_drop);
+ if (state & FA_S_ACCESSED)
+ rt_cache_flush(-1);
+ return 0;
+ }
+
+ /* Error if we find a perfect match which
+ * uses the same scope, type, and nexthop
+ * information.
+ */
+ fa_orig = fa;
+ fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+ list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
+ if (fa->fa_tos != tos)
+ break;
+ if (fa->fa_info->fib_priority != fi->fib_priority)
+ break;
+ if (fa->fa_type == type &&
+ fa->fa_scope == r->rtm_scope &&
+ fa->fa_info == fi)
+ goto out;
+ }
+ if (!(n->nlmsg_flags & NLM_F_APPEND))
+ fa = fa_orig;
+ }
+
+ err = -ENOENT;
+ if (!(n->nlmsg_flags&NLM_F_CREATE))
+ goto out;
+
+ err = -ENOBUFS;
+ new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
+ if (new_fa == NULL)
+ goto out;
+
+ new_f = NULL;
+ if (!f) {
+ new_f = kmem_cache_alloc(fn_hash_kmem, SLAB_KERNEL);
+ if (new_f == NULL)
+ goto out_free_new_fa;
+
+ INIT_HLIST_NODE(&new_f->fn_hash);
+ INIT_LIST_HEAD(&new_f->fn_alias);
+ new_f->fn_key = key;
+ f = new_f;
+ }
+
+ new_fa->fa_info = fi;
+ new_fa->fa_tos = tos;
+ new_fa->fa_type = type;
+ new_fa->fa_scope = r->rtm_scope;
+ new_fa->fa_state = 0;
+
+ /*
+ * Insert new entry to the list.
+ */
+
+ write_lock_bh(&fib_hash_lock);
+ if (new_f)
+ fib_insert_node(fz, new_f);
+ list_add_tail(&new_fa->fa_list,
+ (fa ? &fa->fa_list : &f->fn_alias));
+ fib_hash_genid++;
+ write_unlock_bh(&fib_hash_lock);
+
+ if (new_f)
+ fz->fz_nent++;
+ rt_cache_flush(-1);
+
+ rtmsg_fib(RTM_NEWROUTE, key, new_fa, z, tb->tb_id, n, req);
+ return 0;
+
+out_free_new_fa:
+ kmem_cache_free(fn_alias_kmem, new_fa);
+out:
+ fib_release_info(fi);
+ return err;
+}
+
+
+static int
+fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
+ struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+ struct fn_hash *table = (struct fn_hash*)tb->tb_data;
+ struct fib_node *f;
+ struct fib_alias *fa, *fa_to_delete;
+ int z = r->rtm_dst_len;
+ struct fn_zone *fz;
+ u32 key;
+ u8 tos = r->rtm_tos;
+
+ if (z > 32)
+ return -EINVAL;
+ if ((fz = table->fn_zones[z]) == NULL)
+ return -ESRCH;
+
+ key = 0;
+ if (rta->rta_dst) {
+ u32 dst;
+ memcpy(&dst, rta->rta_dst, 4);
+ if (dst & ~FZ_MASK(fz))
+ return -EINVAL;
+ key = fz_key(dst, fz);
+ }
+
+ f = fib_find_node(fz, key);
+
+ if (!f)
+ fa = NULL;
+ else
+ fa = fib_find_alias(&f->fn_alias, tos, 0);
+ if (!fa)
+ return -ESRCH;
+
+ fa_to_delete = NULL;
+ fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+ list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (fa->fa_tos != tos)
+ break;
+
+ if ((!r->rtm_type ||
+ fa->fa_type == r->rtm_type) &&
+ (r->rtm_scope == RT_SCOPE_NOWHERE ||
+ fa->fa_scope == r->rtm_scope) &&
+ (!r->rtm_protocol ||
+ fi->fib_protocol == r->rtm_protocol) &&
+ fib_nh_match(r, n, rta, fi) == 0) {
+ fa_to_delete = fa;
+ break;
+ }
+ }
+
+ if (fa_to_delete) {
+ int kill_fn;
+
+ fa = fa_to_delete;
+ rtmsg_fib(RTM_DELROUTE, key, fa, z, tb->tb_id, n, req);
+
+ kill_fn = 0;
+ write_lock_bh(&fib_hash_lock);
+ list_del(&fa->fa_list);
+ if (list_empty(&f->fn_alias)) {
+ hlist_del(&f->fn_hash);
+ kill_fn = 1;
+ }
+ fib_hash_genid++;
+ write_unlock_bh(&fib_hash_lock);
+
+ if (fa->fa_state & FA_S_ACCESSED)
+ rt_cache_flush(-1);
+ fn_free_alias(fa);
+ if (kill_fn) {
+ fn_free_node(f);
+ fz->fz_nent--;
+ }
+
+ return 0;
+ }
+ return -ESRCH;
+}
+
+static int fn_flush_list(struct fn_zone *fz, int idx)
+{
+ struct hlist_head *head = &fz->fz_hash[idx];
+ struct hlist_node *node, *n;
+ struct fib_node *f;
+ int found = 0;
+
+ hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
+ struct fib_alias *fa, *fa_node;
+ int kill_f;
+
+ kill_f = 0;
+ list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
+ write_lock_bh(&fib_hash_lock);
+ list_del(&fa->fa_list);
+ if (list_empty(&f->fn_alias)) {
+ hlist_del(&f->fn_hash);
+ kill_f = 1;
+ }
+ fib_hash_genid++;
+ write_unlock_bh(&fib_hash_lock);
+
+ fn_free_alias(fa);
+ found++;
+ }
+ }
+ if (kill_f) {
+ fn_free_node(f);
+ fz->fz_nent--;
+ }
+ }
+ return found;
+}
+
+static int fn_hash_flush(struct fib_table *tb)
+{
+ struct fn_hash *table = (struct fn_hash *) tb->tb_data;
+ struct fn_zone *fz;
+ int found = 0;
+
+ for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
+ int i;
+
+ for (i = fz->fz_divisor - 1; i >= 0; i--)
+ found += fn_flush_list(fz, i);
+ }
+ return found;
+}
+
+
+static inline int
+fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
+ struct fib_table *tb,
+ struct fn_zone *fz,
+ struct hlist_head *head)
+{
+ struct hlist_node *node;
+ struct fib_node *f;
+ int i, s_i;
+
+ s_i = cb->args[3];
+ i = 0;
+ hlist_for_each_entry(f, node, head, fn_hash) {
+ struct fib_alias *fa;
+
+ list_for_each_entry(fa, &f->fn_alias, fa_list) {
+ if (i < s_i)
+ goto next;
+
+ if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWROUTE,
+ tb->tb_id,
+ fa->fa_type,
+ fa->fa_scope,
+ &f->fn_key,
+ fz->fz_order,
+ fa->fa_tos,
+ fa->fa_info) < 0) {
+ cb->args[3] = i;
+ return -1;
+ }
+ next:
+ i++;
+ }
+ }
+ cb->args[3] = i;
+ return skb->len;
+}
+
+static inline int
+fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
+ struct fib_table *tb,
+ struct fn_zone *fz)
+{
+ int h, s_h;
+
+ s_h = cb->args[2];
+ for (h=0; h < fz->fz_divisor; h++) {
+ if (h < s_h) continue;
+ if (h > s_h)
+ memset(&cb->args[3], 0,
+ sizeof(cb->args) - 3*sizeof(cb->args[0]));
+ if (fz->fz_hash == NULL ||
+ hlist_empty(&fz->fz_hash[h]))
+ continue;
+ if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h])<0) {
+ cb->args[2] = h;
+ return -1;
+ }
+ }
+ cb->args[2] = h;
+ return skb->len;
+}
+
+static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int m, s_m;
+ struct fn_zone *fz;
+ struct fn_hash *table = (struct fn_hash*)tb->tb_data;
+
+ s_m = cb->args[1];
+ read_lock(&fib_hash_lock);
+ for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
+ if (m < s_m) continue;
+ if (m > s_m)
+ memset(&cb->args[2], 0,
+ sizeof(cb->args) - 2*sizeof(cb->args[0]));
+ if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
+ cb->args[1] = m;
+ read_unlock(&fib_hash_lock);
+ return -1;
+ }
+ }
+ read_unlock(&fib_hash_lock);
+ cb->args[1] = m;
+ return skb->len;
+}
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+struct fib_table * fib_hash_init(int id)
+#else
+struct fib_table * __init fib_hash_init(int id)
+#endif
+{
+ struct fib_table *tb;
+
+ if (fn_hash_kmem == NULL)
+ fn_hash_kmem = kmem_cache_create("ip_fib_hash",
+ sizeof(struct fib_node),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+
+ if (fn_alias_kmem == NULL)
+ fn_alias_kmem = kmem_cache_create("ip_fib_alias",
+ sizeof(struct fib_alias),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+
+ tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
+ GFP_KERNEL);
+ if (tb == NULL)
+ return NULL;
+
+ tb->tb_id = id;
+ tb->tb_lookup = fn_hash_lookup;
+ tb->tb_insert = fn_hash_insert;
+ tb->tb_delete = fn_hash_delete;
+ tb->tb_flush = fn_hash_flush;
+ tb->tb_select_default = fn_hash_select_default;
+ tb->tb_dump = fn_hash_dump;
+ memset(tb->tb_data, 0, sizeof(struct fn_hash));
+ return tb;
+}
+
+/* ------------------------------------------------------------------------ */
+#ifdef CONFIG_PROC_FS
+
+struct fib_iter_state {
+ struct fn_zone *zone;
+ int bucket;
+ struct hlist_head *hash_head;
+ struct fib_node *fn;
+ struct fib_alias *fa;
+ loff_t pos;
+ unsigned int genid;
+ int valid;
+};
+
+static struct fib_alias *fib_get_first(struct seq_file *seq)
+{
+ struct fib_iter_state *iter = seq->private;
+ struct fn_hash *table = (struct fn_hash *) ip_fib_main_table->tb_data;
+
+ iter->bucket = 0;
+ iter->hash_head = NULL;
+ iter->fn = NULL;
+ iter->fa = NULL;
+ iter->pos = 0;
+ iter->genid = fib_hash_genid;
+ iter->valid = 1;
+
+ for (iter->zone = table->fn_zone_list; iter->zone;
+ iter->zone = iter->zone->fz_next) {
+ int maxslot;
+
+ if (!iter->zone->fz_nent)
+ continue;
+
+ iter->hash_head = iter->zone->fz_hash;
+ maxslot = iter->zone->fz_divisor;
+
+ for (iter->bucket = 0; iter->bucket < maxslot;
+ ++iter->bucket, ++iter->hash_head) {
+ struct hlist_node *node;
+ struct fib_node *fn;
+
+ hlist_for_each_entry(fn,node,iter->hash_head,fn_hash) {
+ struct fib_alias *fa;
+
+ list_for_each_entry(fa,&fn->fn_alias,fa_list) {
+ iter->fn = fn;
+ iter->fa = fa;
+ goto out;
+ }
+ }
+ }
+ }
+out:
+ return iter->fa;
+}
+
+static struct fib_alias *fib_get_next(struct seq_file *seq)
+{
+ struct fib_iter_state *iter = seq->private;
+ struct fib_node *fn;
+ struct fib_alias *fa;
+
+ /* Advance FA, if any. */
+ fn = iter->fn;
+ fa = iter->fa;
+ if (fa) {
+ BUG_ON(!fn);
+ list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
+ iter->fa = fa;
+ goto out;
+ }
+ }
+
+ fa = iter->fa = NULL;
+
+ /* Advance FN. */
+ if (fn) {
+ struct hlist_node *node = &fn->fn_hash;
+ hlist_for_each_entry_continue(fn, node, fn_hash) {
+ iter->fn = fn;
+
+ list_for_each_entry(fa, &fn->fn_alias, fa_list) {
+ iter->fa = fa;
+ goto out;
+ }
+ }
+ }
+
+ fn = iter->fn = NULL;
+
+ /* Advance hash chain. */
+ if (!iter->zone)
+ goto out;
+
+ for (;;) {
+ struct hlist_node *node;
+ int maxslot;
+
+ maxslot = iter->zone->fz_divisor;
+
+ while (++iter->bucket < maxslot) {
+ iter->hash_head++;
+
+ hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
+ list_for_each_entry(fa, &fn->fn_alias, fa_list) {
+ iter->fn = fn;
+ iter->fa = fa;
+ goto out;
+ }
+ }
+ }
+
+ iter->zone = iter->zone->fz_next;
+
+ if (!iter->zone)
+ goto out;
+
+ iter->bucket = 0;
+ iter->hash_head = iter->zone->fz_hash;
+
+ hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
+ list_for_each_entry(fa, &fn->fn_alias, fa_list) {
+ iter->fn = fn;
+ iter->fa = fa;
+ goto out;
+ }
+ }
+ }
+out:
+ iter->pos++;
+ return fa;
+}
+
+static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct fib_iter_state *iter = seq->private;
+ struct fib_alias *fa;
+
+ if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
+ fa = iter->fa;
+ pos -= iter->pos;
+ } else
+ fa = fib_get_first(seq);
+
+ if (fa)
+ while (pos && (fa = fib_get_next(seq)))
+ --pos;
+ return pos ? NULL : fa;
+}
+
+static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ void *v = NULL;
+
+ read_lock(&fib_hash_lock);
+ if (ip_fib_main_table)
+ v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+ return v;
+}
+
+static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
+}
+
+static void fib_seq_stop(struct seq_file *seq, void *v)
+{
+ read_unlock(&fib_hash_lock);
+}
+
+static unsigned fib_flag_trans(int type, u32 mask, struct fib_info *fi)
+{
+ static unsigned type2flags[RTN_MAX + 1] = {
+ [7] = RTF_REJECT, [8] = RTF_REJECT,
+ };
+ unsigned flags = type2flags[type];
+
+ if (fi && fi->fib_nh->nh_gw)
+ flags |= RTF_GATEWAY;
+ if (mask == 0xFFFFFFFF)
+ flags |= RTF_HOST;
+ flags |= RTF_UP;
+ return flags;
+}
+
+/*
+ * This outputs /proc/net/route.
+ *
+ * It always works in backward compatibility mode.
+ * The format of the file is not supposed to be changed.
+ */
+static int fib_seq_show(struct seq_file *seq, void *v)
+{
+ struct fib_iter_state *iter;
+ char bf[128];
+ u32 prefix, mask;
+ unsigned flags;
+ struct fib_node *f;
+ struct fib_alias *fa;
+ struct fib_info *fi;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
+ "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
+ "\tWindow\tIRTT");
+ goto out;
+ }
+
+ iter = seq->private;
+ f = iter->fn;
+ fa = iter->fa;
+ fi = fa->fa_info;
+ prefix = f->fn_key;
+ mask = FZ_MASK(iter->zone);
+ flags = fib_flag_trans(fa->fa_type, mask, fi);
+ if (fi)
+ snprintf(bf, sizeof(bf),
+ "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+ fi->fib_dev ? fi->fib_dev->name : "*", prefix,
+ fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
+ mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
+ fi->fib_window,
+ fi->fib_rtt >> 3);
+ else
+ snprintf(bf, sizeof(bf),
+ "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+ prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0);
+ seq_printf(seq, "%-127s\n", bf);
+out:
+ return 0;
+}
+
+static struct seq_operations fib_seq_ops = {
+ .start = fib_seq_start,
+ .next = fib_seq_next,
+ .stop = fib_seq_stop,
+ .show = fib_seq_show,
+};
+
+static int fib_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct fib_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+
+ rc = seq_open(file, &fib_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+ seq->private = s;
+ memset(s, 0, sizeof(*s));
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+static struct file_operations fib_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = fib_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+int __init fib_proc_init(void)
+{
+ if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+void __init fib_proc_exit(void)
+{
+ proc_net_remove("route");
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
new file mode 100644
index 000000000000..ac4485f75e97
--- /dev/null
+++ b/net/ipv4/fib_lookup.h
@@ -0,0 +1,43 @@
+#ifndef _FIB_LOOKUP_H
+#define _FIB_LOOKUP_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <net/ip_fib.h>
+
+struct fib_alias {
+ struct list_head fa_list;
+ struct fib_info *fa_info;
+ u8 fa_tos;
+ u8 fa_type;
+ u8 fa_scope;
+ u8 fa_state;
+};
+
+#define FA_S_ACCESSED 0x01
+
+/* Exported by fib_semantics.c */
+extern int fib_semantic_match(struct list_head *head,
+ const struct flowi *flp,
+ struct fib_result *res, __u32 zone, __u32 mask,
+ int prefixlen);
+extern void fib_release_info(struct fib_info *);
+extern struct fib_info *fib_create_info(const struct rtmsg *r,
+ struct kern_rta *rta,
+ const struct nlmsghdr *,
+ int *err);
+extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *,
+ struct kern_rta *rta, struct fib_info *fi);
+extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+ u8 tb_id, u8 type, u8 scope, void *dst,
+ int dst_len, u8 tos, struct fib_info *fi);
+extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
+ int z, int tb_id,
+ struct nlmsghdr *n, struct netlink_skb_parms *req);
+extern struct fib_alias *fib_find_alias(struct list_head *fah,
+ u8 tos, u32 prio);
+extern int fib_detect_death(struct fib_info *fi, int order,
+ struct fib_info **last_resort,
+ int *last_idx, int *dflt);
+
+#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
new file mode 100644
index 000000000000..39d0aadb9a2a
--- /dev/null
+++ b/net/ipv4/fib_rules.c
@@ -0,0 +1,437 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: policy rules.
+ *
+ * Version: $Id: fib_rules.c,v 1.17 2001/10/31 21:55:54 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Rani Assaf : local_rule cannot be deleted
+ * Marc Boucher : routing by fwmark
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+
+#define FRprintk(a...)
+
+struct fib_rule
+{
+ struct fib_rule *r_next;
+ atomic_t r_clntref;
+ u32 r_preference;
+ unsigned char r_table;
+ unsigned char r_action;
+ unsigned char r_dst_len;
+ unsigned char r_src_len;
+ u32 r_src;
+ u32 r_srcmask;
+ u32 r_dst;
+ u32 r_dstmask;
+ u32 r_srcmap;
+ u8 r_flags;
+ u8 r_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ u32 r_fwmark;
+#endif
+ int r_ifindex;
+#ifdef CONFIG_NET_CLS_ROUTE
+ __u32 r_tclassid;
+#endif
+ char r_ifname[IFNAMSIZ];
+ int r_dead;
+};
+
+static struct fib_rule default_rule = {
+ .r_clntref = ATOMIC_INIT(2),
+ .r_preference = 0x7FFF,
+ .r_table = RT_TABLE_DEFAULT,
+ .r_action = RTN_UNICAST,
+};
+
+static struct fib_rule main_rule = {
+ .r_next = &default_rule,
+ .r_clntref = ATOMIC_INIT(2),
+ .r_preference = 0x7FFE,
+ .r_table = RT_TABLE_MAIN,
+ .r_action = RTN_UNICAST,
+};
+
+static struct fib_rule local_rule = {
+ .r_next = &main_rule,
+ .r_clntref = ATOMIC_INIT(2),
+ .r_table = RT_TABLE_LOCAL,
+ .r_action = RTN_UNICAST,
+};
+
+static struct fib_rule *fib_rules = &local_rule;
+static DEFINE_RWLOCK(fib_rules_lock);
+
+int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct rtattr **rta = arg;
+ struct rtmsg *rtm = NLMSG_DATA(nlh);
+ struct fib_rule *r, **rp;
+ int err = -ESRCH;
+
+ for (rp=&fib_rules; (r=*rp) != NULL; rp=&r->r_next) {
+ if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 4) == 0) &&
+ rtm->rtm_src_len == r->r_src_len &&
+ rtm->rtm_dst_len == r->r_dst_len &&
+ (!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 4) == 0) &&
+ rtm->rtm_tos == r->r_tos &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ (!rta[RTA_PROTOINFO-1] || memcmp(RTA_DATA(rta[RTA_PROTOINFO-1]), &r->r_fwmark, 4) == 0) &&
+#endif
+ (!rtm->rtm_type || rtm->rtm_type == r->r_action) &&
+ (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) &&
+ (!rta[RTA_IIF-1] || rtattr_strcmp(rta[RTA_IIF-1], r->r_ifname) == 0) &&
+ (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) {
+ err = -EPERM;
+ if (r == &local_rule)
+ break;
+
+ write_lock_bh(&fib_rules_lock);
+ *rp = r->r_next;
+ r->r_dead = 1;
+ write_unlock_bh(&fib_rules_lock);
+ fib_rule_put(r);
+ err = 0;
+ break;
+ }
+ }
+ return err;
+}
+
+/* Allocate new unique table id */
+
+static struct fib_table *fib_empty_table(void)
+{
+ int id;
+
+ for (id = 1; id <= RT_TABLE_MAX; id++)
+ if (fib_tables[id] == NULL)
+ return __fib_new_table(id);
+ return NULL;
+}
+
+void fib_rule_put(struct fib_rule *r)
+{
+ if (atomic_dec_and_test(&r->r_clntref)) {
+ if (r->r_dead)
+ kfree(r);
+ else
+ printk("Freeing alive rule %p\n", r);
+ }
+}
+
+int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct rtattr **rta = arg;
+ struct rtmsg *rtm = NLMSG_DATA(nlh);
+ struct fib_rule *r, *new_r, **rp;
+ unsigned char table_id;
+
+ if (rtm->rtm_src_len > 32 || rtm->rtm_dst_len > 32 ||
+ (rtm->rtm_tos & ~IPTOS_TOS_MASK))
+ return -EINVAL;
+
+ if (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) > IFNAMSIZ)
+ return -EINVAL;
+
+ table_id = rtm->rtm_table;
+ if (table_id == RT_TABLE_UNSPEC) {
+ struct fib_table *table;
+ if (rtm->rtm_type == RTN_UNICAST) {
+ if ((table = fib_empty_table()) == NULL)
+ return -ENOBUFS;
+ table_id = table->tb_id;
+ }
+ }
+
+ new_r = kmalloc(sizeof(*new_r), GFP_KERNEL);
+ if (!new_r)
+ return -ENOMEM;
+ memset(new_r, 0, sizeof(*new_r));
+ if (rta[RTA_SRC-1])
+ memcpy(&new_r->r_src, RTA_DATA(rta[RTA_SRC-1]), 4);
+ if (rta[RTA_DST-1])
+ memcpy(&new_r->r_dst, RTA_DATA(rta[RTA_DST-1]), 4);
+ if (rta[RTA_GATEWAY-1])
+ memcpy(&new_r->r_srcmap, RTA_DATA(rta[RTA_GATEWAY-1]), 4);
+ new_r->r_src_len = rtm->rtm_src_len;
+ new_r->r_dst_len = rtm->rtm_dst_len;
+ new_r->r_srcmask = inet_make_mask(rtm->rtm_src_len);
+ new_r->r_dstmask = inet_make_mask(rtm->rtm_dst_len);
+ new_r->r_tos = rtm->rtm_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ if (rta[RTA_PROTOINFO-1])
+ memcpy(&new_r->r_fwmark, RTA_DATA(rta[RTA_PROTOINFO-1]), 4);
+#endif
+ new_r->r_action = rtm->rtm_type;
+ new_r->r_flags = rtm->rtm_flags;
+ if (rta[RTA_PRIORITY-1])
+ memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
+ new_r->r_table = table_id;
+ if (rta[RTA_IIF-1]) {
+ struct net_device *dev;
+ rtattr_strlcpy(new_r->r_ifname, rta[RTA_IIF-1], IFNAMSIZ);
+ new_r->r_ifindex = -1;
+ dev = __dev_get_by_name(new_r->r_ifname);
+ if (dev)
+ new_r->r_ifindex = dev->ifindex;
+ }
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (rta[RTA_FLOW-1])
+ memcpy(&new_r->r_tclassid, RTA_DATA(rta[RTA_FLOW-1]), 4);
+#endif
+
+ rp = &fib_rules;
+ if (!new_r->r_preference) {
+ r = fib_rules;
+ if (r && (r = r->r_next) != NULL) {
+ rp = &fib_rules->r_next;
+ if (r->r_preference)
+ new_r->r_preference = r->r_preference - 1;
+ }
+ }
+
+ while ( (r = *rp) != NULL ) {
+ if (r->r_preference > new_r->r_preference)
+ break;
+ rp = &r->r_next;
+ }
+
+ new_r->r_next = r;
+ atomic_inc(&new_r->r_clntref);
+ write_lock_bh(&fib_rules_lock);
+ *rp = new_r;
+ write_unlock_bh(&fib_rules_lock);
+ return 0;
+}
+
+#ifdef CONFIG_NET_CLS_ROUTE
+u32 fib_rules_tclass(struct fib_result *res)
+{
+ if (res->r)
+ return res->r->r_tclassid;
+ return 0;
+}
+#endif
+
+
+static void fib_rules_detach(struct net_device *dev)
+{
+ struct fib_rule *r;
+
+ for (r=fib_rules; r; r=r->r_next) {
+ if (r->r_ifindex == dev->ifindex) {
+ write_lock_bh(&fib_rules_lock);
+ r->r_ifindex = -1;
+ write_unlock_bh(&fib_rules_lock);
+ }
+ }
+}
+
+static void fib_rules_attach(struct net_device *dev)
+{
+ struct fib_rule *r;
+
+ for (r=fib_rules; r; r=r->r_next) {
+ if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) {
+ write_lock_bh(&fib_rules_lock);
+ r->r_ifindex = dev->ifindex;
+ write_unlock_bh(&fib_rules_lock);
+ }
+ }
+}
+
+int fib_lookup(const struct flowi *flp, struct fib_result *res)
+{
+ int err;
+ struct fib_rule *r, *policy;
+ struct fib_table *tb;
+
+ u32 daddr = flp->fl4_dst;
+ u32 saddr = flp->fl4_src;
+
+FRprintk("Lookup: %u.%u.%u.%u <- %u.%u.%u.%u ",
+ NIPQUAD(flp->fl4_dst), NIPQUAD(flp->fl4_src));
+ read_lock(&fib_rules_lock);
+ for (r = fib_rules; r; r=r->r_next) {
+ if (((saddr^r->r_src) & r->r_srcmask) ||
+ ((daddr^r->r_dst) & r->r_dstmask) ||
+ (r->r_tos && r->r_tos != flp->fl4_tos) ||
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ (r->r_fwmark && r->r_fwmark != flp->fl4_fwmark) ||
+#endif
+ (r->r_ifindex && r->r_ifindex != flp->iif))
+ continue;
+
+FRprintk("tb %d r %d ", r->r_table, r->r_action);
+ switch (r->r_action) {
+ case RTN_UNICAST:
+ policy = r;
+ break;
+ case RTN_UNREACHABLE:
+ read_unlock(&fib_rules_lock);
+ return -ENETUNREACH;
+ default:
+ case RTN_BLACKHOLE:
+ read_unlock(&fib_rules_lock);
+ return -EINVAL;
+ case RTN_PROHIBIT:
+ read_unlock(&fib_rules_lock);
+ return -EACCES;
+ }
+
+ if ((tb = fib_get_table(r->r_table)) == NULL)
+ continue;
+ err = tb->tb_lookup(tb, flp, res);
+ if (err == 0) {
+ res->r = policy;
+ if (policy)
+ atomic_inc(&policy->r_clntref);
+ read_unlock(&fib_rules_lock);
+ return 0;
+ }
+ if (err < 0 && err != -EAGAIN) {
+ read_unlock(&fib_rules_lock);
+ return err;
+ }
+ }
+FRprintk("FAILURE\n");
+ read_unlock(&fib_rules_lock);
+ return -ENETUNREACH;
+}
+
+void fib_select_default(const struct flowi *flp, struct fib_result *res)
+{
+ if (res->r && res->r->r_action == RTN_UNICAST &&
+ FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) {
+ struct fib_table *tb;
+ if ((tb = fib_get_table(res->r->r_table)) != NULL)
+ tb->tb_select_default(tb, flp, res);
+ }
+}
+
+static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+
+ if (event == NETDEV_UNREGISTER)
+ fib_rules_detach(dev);
+ else if (event == NETDEV_REGISTER)
+ fib_rules_attach(dev);
+ return NOTIFY_DONE;
+}
+
+
+static struct notifier_block fib_rules_notifier = {
+ .notifier_call =fib_rules_event,
+};
+
+static __inline__ int inet_fill_rule(struct sk_buff *skb,
+ struct fib_rule *r,
+ struct netlink_callback *cb)
+{
+ struct rtmsg *rtm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb->tail;
+
+ nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm));
+ rtm = NLMSG_DATA(nlh);
+ rtm->rtm_family = AF_INET;
+ rtm->rtm_dst_len = r->r_dst_len;
+ rtm->rtm_src_len = r->r_src_len;
+ rtm->rtm_tos = r->r_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ if (r->r_fwmark)
+ RTA_PUT(skb, RTA_PROTOINFO, 4, &r->r_fwmark);
+#endif
+ rtm->rtm_table = r->r_table;
+ rtm->rtm_protocol = 0;
+ rtm->rtm_scope = 0;
+ rtm->rtm_type = r->r_action;
+ rtm->rtm_flags = r->r_flags;
+
+ if (r->r_dst_len)
+ RTA_PUT(skb, RTA_DST, 4, &r->r_dst);
+ if (r->r_src_len)
+ RTA_PUT(skb, RTA_SRC, 4, &r->r_src);
+ if (r->r_ifname[0])
+ RTA_PUT(skb, RTA_IIF, IFNAMSIZ, &r->r_ifname);
+ if (r->r_preference)
+ RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference);
+ if (r->r_srcmap)
+ RTA_PUT(skb, RTA_GATEWAY, 4, &r->r_srcmap);
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (r->r_tclassid)
+ RTA_PUT(skb, RTA_FLOW, 4, &r->r_tclassid);
+#endif
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx;
+ int s_idx = cb->args[0];
+ struct fib_rule *r;
+
+ read_lock(&fib_rules_lock);
+ for (r=fib_rules, idx=0; r; r = r->r_next, idx++) {
+ if (idx < s_idx)
+ continue;
+ if (inet_fill_rule(skb, r, cb) < 0)
+ break;
+ }
+ read_unlock(&fib_rules_lock);
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+void __init fib_rules_init(void)
+{
+ register_netdevice_notifier(&fib_rules_notifier);
+}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
new file mode 100644
index 000000000000..029362d66135
--- /dev/null
+++ b/net/ipv4/fib_semantics.c
@@ -0,0 +1,1332 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: semantics.
+ *
+ * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/ip_mp_alg.h>
+
+#include "fib_lookup.h"
+
+#define FSprintk(a...)
+
+static DEFINE_RWLOCK(fib_info_lock);
+static struct hlist_head *fib_info_hash;
+static struct hlist_head *fib_info_laddrhash;
+static unsigned int fib_hash_size;
+static unsigned int fib_info_cnt;
+
+#define DEVINDEX_HASHBITS 8
+#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
+static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+static DEFINE_SPINLOCK(fib_multipath_lock);
+
+#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
+for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
+
+#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
+for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
+
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+/* Hope, that gcc will optimize it to get rid of dummy loop */
+
+#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
+for (nhsel=0; nhsel < 1; nhsel++)
+
+#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
+for (nhsel=0; nhsel < 1; nhsel++)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
+#define endfor_nexthops(fi) }
+
+
+static struct
+{
+ int error;
+ u8 scope;
+} fib_props[RTA_MAX + 1] = {
+ {
+ .error = 0,
+ .scope = RT_SCOPE_NOWHERE,
+ }, /* RTN_UNSPEC */
+ {
+ .error = 0,
+ .scope = RT_SCOPE_UNIVERSE,
+ }, /* RTN_UNICAST */
+ {
+ .error = 0,
+ .scope = RT_SCOPE_HOST,
+ }, /* RTN_LOCAL */
+ {
+ .error = 0,
+ .scope = RT_SCOPE_LINK,
+ }, /* RTN_BROADCAST */
+ {
+ .error = 0,
+ .scope = RT_SCOPE_LINK,
+ }, /* RTN_ANYCAST */
+ {
+ .error = 0,
+ .scope = RT_SCOPE_UNIVERSE,
+ }, /* RTN_MULTICAST */
+ {
+ .error = -EINVAL,
+ .scope = RT_SCOPE_UNIVERSE,
+ }, /* RTN_BLACKHOLE */
+ {
+ .error = -EHOSTUNREACH,
+ .scope = RT_SCOPE_UNIVERSE,
+ }, /* RTN_UNREACHABLE */
+ {
+ .error = -EACCES,
+ .scope = RT_SCOPE_UNIVERSE,
+ }, /* RTN_PROHIBIT */
+ {
+ .error = -EAGAIN,
+ .scope = RT_SCOPE_UNIVERSE,
+ }, /* RTN_THROW */
+ {
+ .error = -EINVAL,
+ .scope = RT_SCOPE_NOWHERE,
+ }, /* RTN_NAT */
+ {
+ .error = -EINVAL,
+ .scope = RT_SCOPE_NOWHERE,
+ }, /* RTN_XRESOLVE */
+};
+
+
+/* Release a nexthop info record */
+
+void free_fib_info(struct fib_info *fi)
+{
+ if (fi->fib_dead == 0) {
+ printk("Freeing alive fib_info %p\n", fi);
+ return;
+ }
+ change_nexthops(fi) {
+ if (nh->nh_dev)
+ dev_put(nh->nh_dev);
+ nh->nh_dev = NULL;
+ } endfor_nexthops(fi);
+ fib_info_cnt--;
+ kfree(fi);
+}
+
+void fib_release_info(struct fib_info *fi)
+{
+ write_lock(&fib_info_lock);
+ if (fi && --fi->fib_treeref == 0) {
+ hlist_del(&fi->fib_hash);
+ if (fi->fib_prefsrc)
+ hlist_del(&fi->fib_lhash);
+ change_nexthops(fi) {
+ if (!nh->nh_dev)
+ continue;
+ hlist_del(&nh->nh_hash);
+ } endfor_nexthops(fi)
+ fi->fib_dead = 1;
+ fib_info_put(fi);
+ }
+ write_unlock(&fib_info_lock);
+}
+
+static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+{
+ const struct fib_nh *onh = ofi->fib_nh;
+
+ for_nexthops(fi) {
+ if (nh->nh_oif != onh->nh_oif ||
+ nh->nh_gw != onh->nh_gw ||
+ nh->nh_scope != onh->nh_scope ||
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ nh->nh_weight != onh->nh_weight ||
+#endif
+#ifdef CONFIG_NET_CLS_ROUTE
+ nh->nh_tclassid != onh->nh_tclassid ||
+#endif
+ ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
+ return -1;
+ onh++;
+ } endfor_nexthops(fi);
+ return 0;
+}
+
+static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
+{
+ unsigned int mask = (fib_hash_size - 1);
+ unsigned int val = fi->fib_nhs;
+
+ val ^= fi->fib_protocol;
+ val ^= fi->fib_prefsrc;
+ val ^= fi->fib_priority;
+
+ return (val ^ (val >> 7) ^ (val >> 12)) & mask;
+}
+
+static struct fib_info *fib_find_info(const struct fib_info *nfi)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct fib_info *fi;
+ unsigned int hash;
+
+ hash = fib_info_hashfn(nfi);
+ head = &fib_info_hash[hash];
+
+ hlist_for_each_entry(fi, node, head, fib_hash) {
+ if (fi->fib_nhs != nfi->fib_nhs)
+ continue;
+ if (nfi->fib_protocol == fi->fib_protocol &&
+ nfi->fib_prefsrc == fi->fib_prefsrc &&
+ nfi->fib_priority == fi->fib_priority &&
+ memcmp(nfi->fib_metrics, fi->fib_metrics,
+ sizeof(fi->fib_metrics)) == 0 &&
+ ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
+ (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
+ return fi;
+ }
+
+ return NULL;
+}
+
+static inline unsigned int fib_devindex_hashfn(unsigned int val)
+{
+ unsigned int mask = DEVINDEX_HASHSIZE - 1;
+
+ return (val ^
+ (val >> DEVINDEX_HASHBITS) ^
+ (val >> (DEVINDEX_HASHBITS * 2))) & mask;
+}
+
+/* Check, that the gateway is already configured.
+ Used only by redirect accept routine.
+ */
+
+int ip_fib_check_default(u32 gw, struct net_device *dev)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct fib_nh *nh;
+ unsigned int hash;
+
+ read_lock(&fib_info_lock);
+
+ hash = fib_devindex_hashfn(dev->ifindex);
+ head = &fib_info_devhash[hash];
+ hlist_for_each_entry(nh, node, head, nh_hash) {
+ if (nh->nh_dev == dev &&
+ nh->nh_gw == gw &&
+ !(nh->nh_flags&RTNH_F_DEAD)) {
+ read_unlock(&fib_info_lock);
+ return 0;
+ }
+ }
+
+ read_unlock(&fib_info_lock);
+
+ return -1;
+}
+
+void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
+ int z, int tb_id,
+ struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+ struct sk_buff *skb;
+ u32 pid = req ? req->pid : 0;
+ int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
+
+ skb = alloc_skb(size, GFP_KERNEL);
+ if (!skb)
+ return;
+
+ if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
+ fa->fa_type, fa->fa_scope, &key, z,
+ fa->fa_tos,
+ fa->fa_info) < 0) {
+ kfree_skb(skb);
+ return;
+ }
+ NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
+ if (n->nlmsg_flags&NLM_F_ECHO)
+ atomic_inc(&skb->users);
+ netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
+ if (n->nlmsg_flags&NLM_F_ECHO)
+ netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
+}
+
+/* Return the first fib alias matching TOS with
+ * priority less than or equal to PRIO.
+ */
+struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
+{
+ if (fah) {
+ struct fib_alias *fa;
+ list_for_each_entry(fa, fah, fa_list) {
+ if (fa->fa_tos > tos)
+ continue;
+ if (fa->fa_info->fib_priority >= prio ||
+ fa->fa_tos < tos)
+ return fa;
+ }
+ }
+ return NULL;
+}
+
+int fib_detect_death(struct fib_info *fi, int order,
+ struct fib_info **last_resort, int *last_idx, int *dflt)
+{
+ struct neighbour *n;
+ int state = NUD_NONE;
+
+ n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
+ if (n) {
+ state = n->nud_state;
+ neigh_release(n);
+ }
+ if (state==NUD_REACHABLE)
+ return 0;
+ if ((state&NUD_VALID) && order != *dflt)
+ return 0;
+ if ((state&NUD_VALID) ||
+ (*last_idx<0 && order > *dflt)) {
+ *last_resort = fi;
+ *last_idx = order;
+ }
+ return 1;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
+{
+ while (RTA_OK(attr,attrlen)) {
+ if (attr->rta_type == type)
+ return *(u32*)RTA_DATA(attr);
+ attr = RTA_NEXT(attr, attrlen);
+ }
+ return 0;
+}
+
+static int
+fib_count_nexthops(struct rtattr *rta)
+{
+ int nhs = 0;
+ struct rtnexthop *nhp = RTA_DATA(rta);
+ int nhlen = RTA_PAYLOAD(rta);
+
+ while (nhlen >= (int)sizeof(struct rtnexthop)) {
+ if ((nhlen -= nhp->rtnh_len) < 0)
+ return 0;
+ nhs++;
+ nhp = RTNH_NEXT(nhp);
+ };
+ return nhs;
+}
+
+static int
+fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
+{
+ struct rtnexthop *nhp = RTA_DATA(rta);
+ int nhlen = RTA_PAYLOAD(rta);
+
+ change_nexthops(fi) {
+ int attrlen = nhlen - sizeof(struct rtnexthop);
+ if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+ return -EINVAL;
+ nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
+ nh->nh_oif = nhp->rtnh_ifindex;
+ nh->nh_weight = nhp->rtnh_hops + 1;
+ if (attrlen) {
+ nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
+#ifdef CONFIG_NET_CLS_ROUTE
+ nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
+#endif
+ }
+ nhp = RTNH_NEXT(nhp);
+ } endfor_nexthops(fi);
+ return 0;
+}
+
+#endif
+
+int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
+ struct fib_info *fi)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ struct rtnexthop *nhp;
+ int nhlen;
+#endif
+
+ if (rta->rta_priority &&
+ *rta->rta_priority != fi->fib_priority)
+ return 1;
+
+ if (rta->rta_oif || rta->rta_gw) {
+ if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
+ (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
+ return 0;
+ return 1;
+ }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (rta->rta_mp == NULL)
+ return 0;
+ nhp = RTA_DATA(rta->rta_mp);
+ nhlen = RTA_PAYLOAD(rta->rta_mp);
+
+ for_nexthops(fi) {
+ int attrlen = nhlen - sizeof(struct rtnexthop);
+ u32 gw;
+
+ if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+ return -EINVAL;
+ if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
+ return 1;
+ if (attrlen) {
+ gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
+ if (gw && gw != nh->nh_gw)
+ return 1;
+#ifdef CONFIG_NET_CLS_ROUTE
+ gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
+ if (gw && gw != nh->nh_tclassid)
+ return 1;
+#endif
+ }
+ nhp = RTNH_NEXT(nhp);
+ } endfor_nexthops(fi);
+#endif
+ return 0;
+}
+
+
+/*
+ Picture
+ -------
+
+ Semantics of nexthop is very messy by historical reasons.
+ We have to take into account, that:
+ a) gateway can be actually local interface address,
+ so that gatewayed route is direct.
+ b) gateway must be on-link address, possibly
+ described not by an ifaddr, but also by a direct route.
+ c) If both gateway and interface are specified, they should not
+ contradict.
+ d) If we use tunnel routes, gateway could be not on-link.
+
+ Attempt to reconcile all of these (alas, self-contradictory) conditions
+ results in pretty ugly and hairy code with obscure logic.
+
+ I chose to generalized it instead, so that the size
+ of code does not increase practically, but it becomes
+ much more general.
+ Every prefix is assigned a "scope" value: "host" is local address,
+ "link" is direct route,
+ [ ... "site" ... "interior" ... ]
+ and "universe" is true gateway route with global meaning.
+
+ Every prefix refers to a set of "nexthop"s (gw, oif),
+ where gw must have narrower scope. This recursion stops
+ when gw has LOCAL scope or if "nexthop" is declared ONLINK,
+ which means that gw is forced to be on link.
+
+ Code is still hairy, but now it is apparently logically
+ consistent and very flexible. F.e. as by-product it allows
+ to co-exists in peace independent exterior and interior
+ routing processes.
+
+ Normally it looks as following.
+
+ {universe prefix} -> (gw, oif) [scope link]
+ |
+ |-> {link prefix} -> (gw, oif) [scope local]
+ |
+ |-> {local prefix} (terminal node)
+ */
+
+static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
+{
+ int err;
+
+ if (nh->nh_gw) {
+ struct fib_result res;
+
+#ifdef CONFIG_IP_ROUTE_PERVASIVE
+ if (nh->nh_flags&RTNH_F_PERVASIVE)
+ return 0;
+#endif
+ if (nh->nh_flags&RTNH_F_ONLINK) {
+ struct net_device *dev;
+
+ if (r->rtm_scope >= RT_SCOPE_LINK)
+ return -EINVAL;
+ if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
+ return -EINVAL;
+ if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
+ return -ENODEV;
+ if (!(dev->flags&IFF_UP))
+ return -ENETDOWN;
+ nh->nh_dev = dev;
+ dev_hold(dev);
+ nh->nh_scope = RT_SCOPE_LINK;
+ return 0;
+ }
+ {
+ struct flowi fl = { .nl_u = { .ip4_u =
+ { .daddr = nh->nh_gw,
+ .scope = r->rtm_scope + 1 } },
+ .oif = nh->nh_oif };
+
+ /* It is not necessary, but requires a bit of thinking */
+ if (fl.fl4_scope < RT_SCOPE_LINK)
+ fl.fl4_scope = RT_SCOPE_LINK;
+ if ((err = fib_lookup(&fl, &res)) != 0)
+ return err;
+ }
+ err = -EINVAL;
+ if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
+ goto out;
+ nh->nh_scope = res.scope;
+ nh->nh_oif = FIB_RES_OIF(res);
+ if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
+ goto out;
+ dev_hold(nh->nh_dev);
+ err = -ENETDOWN;
+ if (!(nh->nh_dev->flags & IFF_UP))
+ goto out;
+ err = 0;
+out:
+ fib_res_put(&res);
+ return err;
+ } else {
+ struct in_device *in_dev;
+
+ if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
+ return -EINVAL;
+
+ in_dev = inetdev_by_index(nh->nh_oif);
+ if (in_dev == NULL)
+ return -ENODEV;
+ if (!(in_dev->dev->flags&IFF_UP)) {
+ in_dev_put(in_dev);
+ return -ENETDOWN;
+ }
+ nh->nh_dev = in_dev->dev;
+ dev_hold(nh->nh_dev);
+ nh->nh_scope = RT_SCOPE_HOST;
+ in_dev_put(in_dev);
+ }
+ return 0;
+}
+
+static inline unsigned int fib_laddr_hashfn(u32 val)
+{
+ unsigned int mask = (fib_hash_size - 1);
+
+ return (val ^ (val >> 7) ^ (val >> 14)) & mask;
+}
+
+static struct hlist_head *fib_hash_alloc(int bytes)
+{
+ if (bytes <= PAGE_SIZE)
+ return kmalloc(bytes, GFP_KERNEL);
+ else
+ return (struct hlist_head *)
+ __get_free_pages(GFP_KERNEL, get_order(bytes));
+}
+
+static void fib_hash_free(struct hlist_head *hash, int bytes)
+{
+ if (!hash)
+ return;
+
+ if (bytes <= PAGE_SIZE)
+ kfree(hash);
+ else
+ free_pages((unsigned long) hash, get_order(bytes));
+}
+
+static void fib_hash_move(struct hlist_head *new_info_hash,
+ struct hlist_head *new_laddrhash,
+ unsigned int new_size)
+{
+ unsigned int old_size = fib_hash_size;
+ unsigned int i;
+
+ write_lock(&fib_info_lock);
+ fib_hash_size = new_size;
+
+ for (i = 0; i < old_size; i++) {
+ struct hlist_head *head = &fib_info_hash[i];
+ struct hlist_node *node, *n;
+ struct fib_info *fi;
+
+ hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
+ struct hlist_head *dest;
+ unsigned int new_hash;
+
+ hlist_del(&fi->fib_hash);
+
+ new_hash = fib_info_hashfn(fi);
+ dest = &new_info_hash[new_hash];
+ hlist_add_head(&fi->fib_hash, dest);
+ }
+ }
+ fib_info_hash = new_info_hash;
+
+ for (i = 0; i < old_size; i++) {
+ struct hlist_head *lhead = &fib_info_laddrhash[i];
+ struct hlist_node *node, *n;
+ struct fib_info *fi;
+
+ hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
+ struct hlist_head *ldest;
+ unsigned int new_hash;
+
+ hlist_del(&fi->fib_lhash);
+
+ new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
+ ldest = &new_laddrhash[new_hash];
+ hlist_add_head(&fi->fib_lhash, ldest);
+ }
+ }
+ fib_info_laddrhash = new_laddrhash;
+
+ write_unlock(&fib_info_lock);
+}
+
+struct fib_info *
+fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
+ const struct nlmsghdr *nlh, int *errp)
+{
+ int err;
+ struct fib_info *fi = NULL;
+ struct fib_info *ofi;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ int nhs = 1;
+#else
+ const int nhs = 1;
+#endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ u32 mp_alg = IP_MP_ALG_NONE;
+#endif
+
+ /* Fast check to catch the most weird cases */
+ if (fib_props[r->rtm_type].scope > r->rtm_scope)
+ goto err_inval;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (rta->rta_mp) {
+ nhs = fib_count_nexthops(rta->rta_mp);
+ if (nhs == 0)
+ goto err_inval;
+ }
+#endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ if (rta->rta_mp_alg) {
+ mp_alg = *rta->rta_mp_alg;
+
+ if (mp_alg < IP_MP_ALG_NONE ||
+ mp_alg > IP_MP_ALG_MAX)
+ goto err_inval;
+ }
+#endif
+
+ err = -ENOBUFS;
+ if (fib_info_cnt >= fib_hash_size) {
+ unsigned int new_size = fib_hash_size << 1;
+ struct hlist_head *new_info_hash;
+ struct hlist_head *new_laddrhash;
+ unsigned int bytes;
+
+ if (!new_size)
+ new_size = 1;
+ bytes = new_size * sizeof(struct hlist_head *);
+ new_info_hash = fib_hash_alloc(bytes);
+ new_laddrhash = fib_hash_alloc(bytes);
+ if (!new_info_hash || !new_laddrhash) {
+ fib_hash_free(new_info_hash, bytes);
+ fib_hash_free(new_laddrhash, bytes);
+ } else {
+ memset(new_info_hash, 0, bytes);
+ memset(new_laddrhash, 0, bytes);
+
+ fib_hash_move(new_info_hash, new_laddrhash, new_size);
+ }
+
+ if (!fib_hash_size)
+ goto failure;
+ }
+
+ fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
+ if (fi == NULL)
+ goto failure;
+ fib_info_cnt++;
+ memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));
+
+ fi->fib_protocol = r->rtm_protocol;
+
+ fi->fib_nhs = nhs;
+ change_nexthops(fi) {
+ nh->nh_parent = fi;
+ } endfor_nexthops(fi)
+
+ fi->fib_flags = r->rtm_flags;
+ if (rta->rta_priority)
+ fi->fib_priority = *rta->rta_priority;
+ if (rta->rta_mx) {
+ int attrlen = RTA_PAYLOAD(rta->rta_mx);
+ struct rtattr *attr = RTA_DATA(rta->rta_mx);
+
+ while (RTA_OK(attr, attrlen)) {
+ unsigned flavor = attr->rta_type;
+ if (flavor) {
+ if (flavor > RTAX_MAX)
+ goto err_inval;
+ fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
+ }
+ attr = RTA_NEXT(attr, attrlen);
+ }
+ }
+ if (rta->rta_prefsrc)
+ memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
+
+ if (rta->rta_mp) {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
+ goto failure;
+ if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
+ goto err_inval;
+ if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
+ goto err_inval;
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
+ goto err_inval;
+#endif
+#else
+ goto err_inval;
+#endif
+ } else {
+ struct fib_nh *nh = fi->fib_nh;
+ if (rta->rta_oif)
+ nh->nh_oif = *rta->rta_oif;
+ if (rta->rta_gw)
+ memcpy(&nh->nh_gw, rta->rta_gw, 4);
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (rta->rta_flow)
+ memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
+#endif
+ nh->nh_flags = r->rtm_flags;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ nh->nh_weight = 1;
+#endif
+ }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ fi->fib_mp_alg = mp_alg;
+#endif
+
+ if (fib_props[r->rtm_type].error) {
+ if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
+ goto err_inval;
+ goto link_it;
+ }
+
+ if (r->rtm_scope > RT_SCOPE_HOST)
+ goto err_inval;
+
+ if (r->rtm_scope == RT_SCOPE_HOST) {
+ struct fib_nh *nh = fi->fib_nh;
+
+ /* Local address is added. */
+ if (nhs != 1 || nh->nh_gw)
+ goto err_inval;
+ nh->nh_scope = RT_SCOPE_NOWHERE;
+ nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
+ err = -ENODEV;
+ if (nh->nh_dev == NULL)
+ goto failure;
+ } else {
+ change_nexthops(fi) {
+ if ((err = fib_check_nh(r, fi, nh)) != 0)
+ goto failure;
+ } endfor_nexthops(fi)
+ }
+
+ if (fi->fib_prefsrc) {
+ if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
+ memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
+ if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
+ goto err_inval;
+ }
+
+link_it:
+ if ((ofi = fib_find_info(fi)) != NULL) {
+ fi->fib_dead = 1;
+ free_fib_info(fi);
+ ofi->fib_treeref++;
+ return ofi;
+ }
+
+ fi->fib_treeref++;
+ atomic_inc(&fi->fib_clntref);
+ write_lock(&fib_info_lock);
+ hlist_add_head(&fi->fib_hash,
+ &fib_info_hash[fib_info_hashfn(fi)]);
+ if (fi->fib_prefsrc) {
+ struct hlist_head *head;
+
+ head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
+ hlist_add_head(&fi->fib_lhash, head);
+ }
+ change_nexthops(fi) {
+ struct hlist_head *head;
+ unsigned int hash;
+
+ if (!nh->nh_dev)
+ continue;
+ hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
+ head = &fib_info_devhash[hash];
+ hlist_add_head(&nh->nh_hash, head);
+ } endfor_nexthops(fi)
+ write_unlock(&fib_info_lock);
+ return fi;
+
+err_inval:
+ err = -EINVAL;
+
+failure:
+ *errp = err;
+ if (fi) {
+ fi->fib_dead = 1;
+ free_fib_info(fi);
+ }
+ return NULL;
+}
+
+int fib_semantic_match(struct list_head *head, const struct flowi *flp,
+ struct fib_result *res, __u32 zone, __u32 mask,
+ int prefixlen)
+{
+ struct fib_alias *fa;
+ int nh_sel = 0;
+
+ list_for_each_entry(fa, head, fa_list) {
+ int err;
+
+ if (fa->fa_tos &&
+ fa->fa_tos != flp->fl4_tos)
+ continue;
+
+ if (fa->fa_scope < flp->fl4_scope)
+ continue;
+
+ fa->fa_state |= FA_S_ACCESSED;
+
+ err = fib_props[fa->fa_type].error;
+ if (err == 0) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (fi->fib_flags & RTNH_F_DEAD)
+ continue;
+
+ switch (fa->fa_type) {
+ case RTN_UNICAST:
+ case RTN_LOCAL:
+ case RTN_BROADCAST:
+ case RTN_ANYCAST:
+ case RTN_MULTICAST:
+ for_nexthops(fi) {
+ if (nh->nh_flags&RTNH_F_DEAD)
+ continue;
+ if (!flp->oif || flp->oif == nh->nh_oif)
+ break;
+ }
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (nhsel < fi->fib_nhs) {
+ nh_sel = nhsel;
+ goto out_fill_res;
+ }
+#else
+ if (nhsel < 1) {
+ goto out_fill_res;
+ }
+#endif
+ endfor_nexthops(fi);
+ continue;
+
+ default:
+ printk(KERN_DEBUG "impossible 102\n");
+ return -EINVAL;
+ };
+ }
+ return err;
+ }
+ return 1;
+
+out_fill_res:
+ res->prefixlen = prefixlen;
+ res->nh_sel = nh_sel;
+ res->type = fa->fa_type;
+ res->scope = fa->fa_scope;
+ res->fi = fa->fa_info;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+ res->netmask = mask;
+ res->network = zone &
+ (0xFFFFFFFF >> (32 - prefixlen));
+#endif
+ atomic_inc(&res->fi->fib_clntref);
+ return 0;
+}
+
+/* Find appropriate source address to this destination */
+
+u32 __fib_res_prefsrc(struct fib_result *res)
+{
+ return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
+}
+
+int
+fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+ u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
+ struct fib_info *fi)
+{
+ struct rtmsg *rtm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb->tail;
+
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
+ rtm = NLMSG_DATA(nlh);
+ rtm->rtm_family = AF_INET;
+ rtm->rtm_dst_len = dst_len;
+ rtm->rtm_src_len = 0;
+ rtm->rtm_tos = tos;
+ rtm->rtm_table = tb_id;
+ rtm->rtm_type = type;
+ rtm->rtm_flags = fi->fib_flags;
+ rtm->rtm_scope = scope;
+ if (rtm->rtm_dst_len)
+ RTA_PUT(skb, RTA_DST, 4, dst);
+ rtm->rtm_protocol = fi->fib_protocol;
+ if (fi->fib_priority)
+ RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (fi->fib_nh[0].nh_tclassid)
+ RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
+#endif
+ if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
+ goto rtattr_failure;
+ if (fi->fib_prefsrc)
+ RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
+ if (fi->fib_nhs == 1) {
+ if (fi->fib_nh->nh_gw)
+ RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
+ if (fi->fib_nh->nh_oif)
+ RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
+ }
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (fi->fib_nhs > 1) {
+ struct rtnexthop *nhp;
+ struct rtattr *mp_head;
+ if (skb_tailroom(skb) <= RTA_SPACE(0))
+ goto rtattr_failure;
+ mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));
+
+ for_nexthops(fi) {
+ if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
+ goto rtattr_failure;
+ nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
+ nhp->rtnh_flags = nh->nh_flags & 0xFF;
+ nhp->rtnh_hops = nh->nh_weight-1;
+ nhp->rtnh_ifindex = nh->nh_oif;
+ if (nh->nh_gw)
+ RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
+ nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
+ } endfor_nexthops(fi);
+ mp_head->rta_type = RTA_MULTIPATH;
+ mp_head->rta_len = skb->tail - (u8*)mp_head;
+ }
+#endif
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+#ifndef CONFIG_IP_NOSIOCRT
+
+int
+fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
+ struct kern_rta *rta, struct rtentry *r)
+{
+ int plen;
+ u32 *ptr;
+
+ memset(rtm, 0, sizeof(*rtm));
+ memset(rta, 0, sizeof(*rta));
+
+ if (r->rt_dst.sa_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ /* Check mask for validity:
+ a) it must be contiguous.
+ b) destination must have all host bits clear.
+ c) if application forgot to set correct family (AF_INET),
+ reject request unless it is absolutely clear i.e.
+ both family and mask are zero.
+ */
+ plen = 32;
+ ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
+ if (!(r->rt_flags&RTF_HOST)) {
+ u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
+ if (r->rt_genmask.sa_family != AF_INET) {
+ if (mask || r->rt_genmask.sa_family)
+ return -EAFNOSUPPORT;
+ }
+ if (bad_mask(mask, *ptr))
+ return -EINVAL;
+ plen = inet_mask_len(mask);
+ }
+
+ nl->nlmsg_flags = NLM_F_REQUEST;
+ nl->nlmsg_pid = 0;
+ nl->nlmsg_seq = 0;
+ nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
+ if (cmd == SIOCDELRT) {
+ nl->nlmsg_type = RTM_DELROUTE;
+ nl->nlmsg_flags = 0;
+ } else {
+ nl->nlmsg_type = RTM_NEWROUTE;
+ nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
+ rtm->rtm_protocol = RTPROT_BOOT;
+ }
+
+ rtm->rtm_dst_len = plen;
+ rta->rta_dst = ptr;
+
+ if (r->rt_metric) {
+ *(u32*)&r->rt_pad3 = r->rt_metric - 1;
+ rta->rta_priority = (u32*)&r->rt_pad3;
+ }
+ if (r->rt_flags&RTF_REJECT) {
+ rtm->rtm_scope = RT_SCOPE_HOST;
+ rtm->rtm_type = RTN_UNREACHABLE;
+ return 0;
+ }
+ rtm->rtm_scope = RT_SCOPE_NOWHERE;
+ rtm->rtm_type = RTN_UNICAST;
+
+ if (r->rt_dev) {
+ char *colon;
+ struct net_device *dev;
+ char devname[IFNAMSIZ];
+
+ if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
+ return -EFAULT;
+ devname[IFNAMSIZ-1] = 0;
+ colon = strchr(devname, ':');
+ if (colon)
+ *colon = 0;
+ dev = __dev_get_by_name(devname);
+ if (!dev)
+ return -ENODEV;
+ rta->rta_oif = &dev->ifindex;
+ if (colon) {
+ struct in_ifaddr *ifa;
+ struct in_device *in_dev = __in_dev_get(dev);
+ if (!in_dev)
+ return -ENODEV;
+ *colon = ':';
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+ if (strcmp(ifa->ifa_label, devname) == 0)
+ break;
+ if (ifa == NULL)
+ return -ENODEV;
+ rta->rta_prefsrc = &ifa->ifa_local;
+ }
+ }
+
+ ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
+ if (r->rt_gateway.sa_family == AF_INET && *ptr) {
+ rta->rta_gw = ptr;
+ if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
+ rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+ }
+
+ if (cmd == SIOCDELRT)
+ return 0;
+
+ if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
+ return -EINVAL;
+
+ if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
+ rtm->rtm_scope = RT_SCOPE_LINK;
+
+ if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
+ struct rtattr *rec;
+ struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
+ if (mx == NULL)
+ return -ENOMEM;
+ rta->rta_mx = mx;
+ mx->rta_type = RTA_METRICS;
+ mx->rta_len = RTA_LENGTH(0);
+ if (r->rt_flags&RTF_MTU) {
+ rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
+ rec->rta_type = RTAX_ADVMSS;
+ rec->rta_len = RTA_LENGTH(4);
+ mx->rta_len += RTA_LENGTH(4);
+ *(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
+ }
+ if (r->rt_flags&RTF_WINDOW) {
+ rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
+ rec->rta_type = RTAX_WINDOW;
+ rec->rta_len = RTA_LENGTH(4);
+ mx->rta_len += RTA_LENGTH(4);
+ *(u32*)RTA_DATA(rec) = r->rt_window;
+ }
+ if (r->rt_flags&RTF_IRTT) {
+ rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
+ rec->rta_type = RTAX_RTT;
+ rec->rta_len = RTA_LENGTH(4);
+ mx->rta_len += RTA_LENGTH(4);
+ *(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
+ }
+ }
+ return 0;
+}
+
+#endif
+
+/*
+ Update FIB if:
+ - local address disappeared -> we must delete all the entries
+ referring to it.
+ - device went down -> we must shutdown all nexthops going via it.
+ */
+
+int fib_sync_down(u32 local, struct net_device *dev, int force)
+{
+ int ret = 0;
+ int scope = RT_SCOPE_NOWHERE;
+
+ if (force)
+ scope = -1;
+
+ if (local && fib_info_laddrhash) {
+ unsigned int hash = fib_laddr_hashfn(local);
+ struct hlist_head *head = &fib_info_laddrhash[hash];
+ struct hlist_node *node;
+ struct fib_info *fi;
+
+ hlist_for_each_entry(fi, node, head, fib_lhash) {
+ if (fi->fib_prefsrc == local) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ ret++;
+ }
+ }
+ }
+
+ if (dev) {
+ struct fib_info *prev_fi = NULL;
+ unsigned int hash = fib_devindex_hashfn(dev->ifindex);
+ struct hlist_head *head = &fib_info_devhash[hash];
+ struct hlist_node *node;
+ struct fib_nh *nh;
+
+ hlist_for_each_entry(nh, node, head, nh_hash) {
+ struct fib_info *fi = nh->nh_parent;
+ int dead;
+
+ BUG_ON(!fi->fib_nhs);
+ if (nh->nh_dev != dev || fi == prev_fi)
+ continue;
+ prev_fi = fi;
+ dead = 0;
+ change_nexthops(fi) {
+ if (nh->nh_flags&RTNH_F_DEAD)
+ dead++;
+ else if (nh->nh_dev == dev &&
+ nh->nh_scope != scope) {
+ nh->nh_flags |= RTNH_F_DEAD;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ spin_lock_bh(&fib_multipath_lock);
+ fi->fib_power -= nh->nh_power;
+ nh->nh_power = 0;
+ spin_unlock_bh(&fib_multipath_lock);
+#endif
+ dead++;
+ }
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (force > 1 && nh->nh_dev == dev) {
+ dead = fi->fib_nhs;
+ break;
+ }
+#endif
+ } endfor_nexthops(fi)
+ if (dead == fi->fib_nhs) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ ret++;
+ }
+ }
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/*
+ Dead device goes up. We wake up dead nexthops.
+ It takes sense only on multipath routes.
+ */
+
+int fib_sync_up(struct net_device *dev)
+{
+ struct fib_info *prev_fi;
+ unsigned int hash;
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct fib_nh *nh;
+ int ret;
+
+ if (!(dev->flags&IFF_UP))
+ return 0;
+
+ prev_fi = NULL;
+ hash = fib_devindex_hashfn(dev->ifindex);
+ head = &fib_info_devhash[hash];
+ ret = 0;
+
+ hlist_for_each_entry(nh, node, head, nh_hash) {
+ struct fib_info *fi = nh->nh_parent;
+ int alive;
+
+ BUG_ON(!fi->fib_nhs);
+ if (nh->nh_dev != dev || fi == prev_fi)
+ continue;
+
+ prev_fi = fi;
+ alive = 0;
+ change_nexthops(fi) {
+ if (!(nh->nh_flags&RTNH_F_DEAD)) {
+ alive++;
+ continue;
+ }
+ if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
+ continue;
+ if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
+ continue;
+ alive++;
+ spin_lock_bh(&fib_multipath_lock);
+ nh->nh_power = 0;
+ nh->nh_flags &= ~RTNH_F_DEAD;
+ spin_unlock_bh(&fib_multipath_lock);
+ } endfor_nexthops(fi)
+
+ if (alive > 0) {
+ fi->fib_flags &= ~RTNH_F_DEAD;
+ ret++;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ The algorithm is suboptimal, but it provides really
+ fair weighted route distribution.
+ */
+
+void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
+{
+ struct fib_info *fi = res->fi;
+ int w;
+
+ spin_lock_bh(&fib_multipath_lock);
+ if (fi->fib_power <= 0) {
+ int power = 0;
+ change_nexthops(fi) {
+ if (!(nh->nh_flags&RTNH_F_DEAD)) {
+ power += nh->nh_weight;
+ nh->nh_power = nh->nh_weight;
+ }
+ } endfor_nexthops(fi);
+ fi->fib_power = power;
+ if (power <= 0) {
+ spin_unlock_bh(&fib_multipath_lock);
+ /* Race condition: route has just become dead. */
+ res->nh_sel = 0;
+ return;
+ }
+ }
+
+
+ /* w should be random number [0..fi->fib_power-1],
+ it is pretty bad approximation.
+ */
+
+ w = jiffies % fi->fib_power;
+
+ change_nexthops(fi) {
+ if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
+ if ((w -= nh->nh_power) <= 0) {
+ nh->nh_power--;
+ fi->fib_power--;
+ res->nh_sel = nhsel;
+ spin_unlock_bh(&fib_multipath_lock);
+ return;
+ }
+ }
+ } endfor_nexthops(fi);
+
+ /* Race condition: route has just become dead. */
+ res->nh_sel = 0;
+ spin_unlock_bh(&fib_multipath_lock);
+}
+#endif
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
new file mode 100644
index 000000000000..85bf0d3e294b
--- /dev/null
+++ b/net/ipv4/icmp.c
@@ -0,0 +1,1143 @@
+/*
+ * NET3: Implementation of the ICMP protocol layer.
+ *
+ * Alan Cox, <alan@redhat.com>
+ *
+ * Version: $Id: icmp.c,v 1.85 2002/02/01 22:01:03 davem Exp $
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Some of the function names and the icmp unreach table for this
+ * module were derived from [icmp.c 1.0.11 06/02/93] by
+ * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting.
+ * Other than that this module is a complete rewrite.
+ *
+ * Fixes:
+ * Clemens Fruhwirth : introduce global icmp rate limiting
+ * with icmp type masking ability instead
+ * of broken per type icmp timeouts.
+ * Mike Shaver : RFC1122 checks.
+ * Alan Cox : Multicast ping reply as self.
+ * Alan Cox : Fix atomicity lockup in ip_build_xmit
+ * call.
+ * Alan Cox : Added 216,128 byte paths to the MTU
+ * code.
+ * Martin Mares : RFC1812 checks.
+ * Martin Mares : Can be configured to follow redirects
+ * if acting as a router _without_ a
+ * routing protocol (RFC 1812).
+ * Martin Mares : Echo requests may be configured to
+ * be ignored (RFC 1812).
+ * Martin Mares : Limitation of ICMP error message
+ * transmit rate (RFC 1812).
+ * Martin Mares : TOS and Precedence set correctly
+ * (RFC 1812).
+ * Martin Mares : Now copying as much data from the
+ * original packet as we can without
+ * exceeding 576 bytes (RFC 1812).
+ * Willy Konynenberg : Transparent proxying support.
+ * Keith Owens : RFC1191 correction for 4.2BSD based
+ * path MTU bug.
+ * Thomas Quinot : ICMP Dest Unreach codes up to 15 are
+ * valid (RFC 1812).
+ * Andi Kleen : Check all packet lengths properly
+ * and moved all kfree_skb() up to
+ * icmp_rcv.
+ * Andi Kleen : Move the rate limit bookkeeping
+ * into the dest entry and use a token
+ * bucket filter (thanks to ANK). Make
+ * the rates sysctl configurable.
+ * Yu Tianli : Fixed two ugly bugs in icmp_send
+ * - IP option length was accounted wrongly
+ * - ICMP header length was not accounted
+ * at all.
+ * Tristan Greaves : Added sysctl option to ignore bogus
+ * broadcast responses from broken routers.
+ *
+ * To Fix:
+ *
+ * - Should use skb_pull() instead of all the manual checking.
+ * This would also greatly simply some upper layer error handlers. --AK
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/init.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <net/checksum.h>
+
+/*
+ * Build xmit assembly blocks
+ */
+
+struct icmp_bxm {
+ struct sk_buff *skb;
+ int offset;
+ int data_len;
+
+ struct {
+ struct icmphdr icmph;
+ __u32 times[3];
+ } data;
+ int head_len;
+ struct ip_options replyopts;
+ unsigned char optbuf[40];
+};
+
+/*
+ * Statistics
+ */
+DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics);
+
+/* An array of errno for error messages from dest unreach. */
+/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
+
+struct icmp_err icmp_err_convert[] = {
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_UNREACH */
+ .fatal = 0,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_HOST_UNREACH */
+ .fatal = 0,
+ },
+ {
+ .errno = ENOPROTOOPT /* ICMP_PROT_UNREACH */,
+ .fatal = 1,
+ },
+ {
+ .errno = ECONNREFUSED, /* ICMP_PORT_UNREACH */
+ .fatal = 1,
+ },
+ {
+ .errno = EMSGSIZE, /* ICMP_FRAG_NEEDED */
+ .fatal = 0,
+ },
+ {
+ .errno = EOPNOTSUPP, /* ICMP_SR_FAILED */
+ .fatal = 0,
+ },
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_UNKNOWN */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTDOWN, /* ICMP_HOST_UNKNOWN */
+ .fatal = 1,
+ },
+ {
+ .errno = ENONET, /* ICMP_HOST_ISOLATED */
+ .fatal = 1,
+ },
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_ANO */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_HOST_ANO */
+ .fatal = 1,
+ },
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_UNR_TOS */
+ .fatal = 0,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_HOST_UNR_TOS */
+ .fatal = 0,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_PKT_FILTERED */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_PREC_VIOLATION */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_PREC_CUTOFF */
+ .fatal = 1,
+ },
+};
+
+/* Control parameters for ECHO replies. */
+int sysctl_icmp_echo_ignore_all;
+int sysctl_icmp_echo_ignore_broadcasts;
+
+/* Control parameter - ignore bogus broadcast responses? */
+int sysctl_icmp_ignore_bogus_error_responses;
+
+/*
+ * Configurable global rate limit.
+ *
+ * ratelimit defines tokens/packet consumed for dst->rate_token bucket
+ * ratemask defines which icmp types are ratelimited by setting
+ * it's bit position.
+ *
+ * default:
+ * dest unreachable (3), source quench (4),
+ * time exceeded (11), parameter problem (12)
+ */
+
+int sysctl_icmp_ratelimit = 1 * HZ;
+int sysctl_icmp_ratemask = 0x1818;
+
+/*
+ * ICMP control array. This specifies what to do with each ICMP.
+ */
+
+struct icmp_control {
+ int output_entry; /* Field for increment on output */
+ int input_entry; /* Field for increment on input */
+ void (*handler)(struct sk_buff *skb);
+ short error; /* This ICMP is classed as an error message */
+};
+
+static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
+
+/*
+ * The ICMP socket(s). This is the most convenient way to flow control
+ * our ICMP output as well as maintain a clean interface throughout
+ * all layers. All Socketless IP sends will soon be gone.
+ *
+ * On SMP we have one ICMP socket per-cpu.
+ */
+static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL;
+#define icmp_socket __get_cpu_var(__icmp_socket)
+
+static __inline__ int icmp_xmit_lock(void)
+{
+ local_bh_disable();
+
+ if (unlikely(!spin_trylock(&icmp_socket->sk->sk_lock.slock))) {
+ /* This can happen if the output path signals a
+ * dst_link_failure() for an outgoing ICMP packet.
+ */
+ local_bh_enable();
+ return 1;
+ }
+ return 0;
+}
+
+static void icmp_xmit_unlock(void)
+{
+ spin_unlock_bh(&icmp_socket->sk->sk_lock.slock);
+}
+
+/*
+ * Send an ICMP frame.
+ */
+
+/*
+ * Check transmit rate limitation for given message.
+ * The rate information is held in the destination cache now.
+ * This function is generic and could be used for other purposes
+ * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ * Note that the same dst_entry fields are modified by functions in
+ * route.c too, but these work for packet destinations while xrlim_allow
+ * works for icmp destinations. This means the rate limiting information
+ * for one "ip object" is shared - and these ICMPs are twice limited:
+ * by source and by destination.
+ *
+ * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ * SHOULD allow setting of rate limits
+ *
+ * Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
+int xrlim_allow(struct dst_entry *dst, int timeout)
+{
+ unsigned long now;
+ int rc = 0;
+
+ now = jiffies;
+ dst->rate_tokens += now - dst->rate_last;
+ dst->rate_last = now;
+ if (dst->rate_tokens > XRLIM_BURST_FACTOR * timeout)
+ dst->rate_tokens = XRLIM_BURST_FACTOR * timeout;
+ if (dst->rate_tokens >= timeout) {
+ dst->rate_tokens -= timeout;
+ rc = 1;
+ }
+ return rc;
+}
+
+static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code)
+{
+ struct dst_entry *dst = &rt->u.dst;
+ int rc = 1;
+
+ if (type > NR_ICMP_TYPES)
+ goto out;
+
+ /* Don't limit PMTU discovery. */
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ goto out;
+
+ /* No rate limit on loopback */
+ if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
+ goto out;
+
+ /* Limit if icmp type is enabled in ratemask. */
+ if ((1 << type) & sysctl_icmp_ratemask)
+ rc = xrlim_allow(dst, sysctl_icmp_ratelimit);
+out:
+ return rc;
+}
+
+/*
+ * Maintain the counters used in the SNMP statistics for outgoing ICMP
+ */
+static void icmp_out_count(int type)
+{
+ if (type <= NR_ICMP_TYPES) {
+ ICMP_INC_STATS(icmp_pointers[type].output_entry);
+ ICMP_INC_STATS(ICMP_MIB_OUTMSGS);
+ }
+}
+
+/*
+ * Checksum each fragment, and on the first include the headers and final
+ * checksum.
+ */
+static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
+ struct sk_buff *skb)
+{
+ struct icmp_bxm *icmp_param = (struct icmp_bxm *)from;
+ unsigned int csum;
+
+ csum = skb_copy_and_csum_bits(icmp_param->skb,
+ icmp_param->offset + offset,
+ to, len, 0);
+
+ skb->csum = csum_block_add(skb->csum, csum, odd);
+ if (icmp_pointers[icmp_param->data.icmph.type].error)
+ nf_ct_attach(skb, icmp_param->skb);
+ return 0;
+}
+
+static void icmp_push_reply(struct icmp_bxm *icmp_param,
+ struct ipcm_cookie *ipc, struct rtable *rt)
+{
+ struct sk_buff *skb;
+
+ ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
+ icmp_param->data_len+icmp_param->head_len,
+ icmp_param->head_len,
+ ipc, rt, MSG_DONTWAIT);
+
+ if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
+ struct icmphdr *icmph = skb->h.icmph;
+ unsigned int csum = 0;
+ struct sk_buff *skb1;
+
+ skb_queue_walk(&icmp_socket->sk->sk_write_queue, skb1) {
+ csum = csum_add(csum, skb1->csum);
+ }
+ csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
+ (char *)icmph,
+ icmp_param->head_len, csum);
+ icmph->checksum = csum_fold(csum);
+ skb->ip_summed = CHECKSUM_NONE;
+ ip_push_pending_frames(icmp_socket->sk);
+ }
+}
+
+/*
+ * Driving logic for building and sending ICMP messages.
+ */
+
+static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
+{
+ struct sock *sk = icmp_socket->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipcm_cookie ipc;
+ struct rtable *rt = (struct rtable *)skb->dst;
+ u32 daddr;
+
+ if (ip_options_echo(&icmp_param->replyopts, skb))
+ goto out;
+
+ if (icmp_xmit_lock())
+ return;
+
+ icmp_param->data.icmph.checksum = 0;
+ icmp_out_count(icmp_param->data.icmph.type);
+
+ inet->tos = skb->nh.iph->tos;
+ daddr = ipc.addr = rt->rt_src;
+ ipc.opt = NULL;
+ if (icmp_param->replyopts.optlen) {
+ ipc.opt = &icmp_param->replyopts;
+ if (ipc.opt->srr)
+ daddr = icmp_param->replyopts.faddr;
+ }
+ {
+ struct flowi fl = { .nl_u = { .ip4_u =
+ { .daddr = daddr,
+ .saddr = rt->rt_spec_dst,
+ .tos = RT_TOS(skb->nh.iph->tos) } },
+ .proto = IPPROTO_ICMP };
+ if (ip_route_output_key(&rt, &fl))
+ goto out_unlock;
+ }
+ if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type,
+ icmp_param->data.icmph.code))
+ icmp_push_reply(icmp_param, &ipc, rt);
+ ip_rt_put(rt);
+out_unlock:
+ icmp_xmit_unlock();
+out:;
+}
+
+
+/*
+ * Send an ICMP message in response to a situation
+ *
+ * RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of header.
+ * MAY send more (we do).
+ * MUST NOT change this header information.
+ * MUST NOT reply to a multicast/broadcast IP address.
+ * MUST NOT reply to a multicast/broadcast MAC address.
+ * MUST reply to only the first fragment.
+ */
+
+void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info)
+{
+ struct iphdr *iph;
+ int room;
+ struct icmp_bxm icmp_param;
+ struct rtable *rt = (struct rtable *)skb_in->dst;
+ struct ipcm_cookie ipc;
+ u32 saddr;
+ u8 tos;
+
+ if (!rt)
+ goto out;
+
+ /*
+ * Find the original header. It is expected to be valid, of course.
+ * Check this, icmp_send is called from the most obscure devices
+ * sometimes.
+ */
+ iph = skb_in->nh.iph;
+
+ if ((u8 *)iph < skb_in->head || (u8 *)(iph + 1) > skb_in->tail)
+ goto out;
+
+ /*
+ * No replies to physical multicast/broadcast
+ */
+ if (skb_in->pkt_type != PACKET_HOST)
+ goto out;
+
+ /*
+ * Now check at the protocol level
+ */
+ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ goto out;
+
+ /*
+ * Only reply to fragment 0. We byte re-order the constant
+ * mask for efficiency.
+ */
+ if (iph->frag_off & htons(IP_OFFSET))
+ goto out;
+
+ /*
+ * If we send an ICMP error to an ICMP error a mess would result..
+ */
+ if (icmp_pointers[type].error) {
+ /*
+ * We are an error, check if we are replying to an
+ * ICMP error
+ */
+ if (iph->protocol == IPPROTO_ICMP) {
+ u8 _inner_type, *itp;
+
+ itp = skb_header_pointer(skb_in,
+ skb_in->nh.raw +
+ (iph->ihl << 2) +
+ offsetof(struct icmphdr,
+ type) -
+ skb_in->data,
+ sizeof(_inner_type),
+ &_inner_type);
+ if (itp == NULL)
+ goto out;
+
+ /*
+ * Assume any unknown ICMP type is an error. This
+ * isn't specified by the RFC, but think about it..
+ */
+ if (*itp > NR_ICMP_TYPES ||
+ icmp_pointers[*itp].error)
+ goto out;
+ }
+ }
+
+ if (icmp_xmit_lock())
+ return;
+
+ /*
+ * Construct source address and options.
+ */
+
+ saddr = iph->daddr;
+ if (!(rt->rt_flags & RTCF_LOCAL))
+ saddr = 0;
+
+ tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
+ IPTOS_PREC_INTERNETCONTROL) :
+ iph->tos;
+
+ if (ip_options_echo(&icmp_param.replyopts, skb_in))
+ goto ende;
+
+
+ /*
+ * Prepare data for ICMP header.
+ */
+
+ icmp_param.data.icmph.type = type;
+ icmp_param.data.icmph.code = code;
+ icmp_param.data.icmph.un.gateway = info;
+ icmp_param.data.icmph.checksum = 0;
+ icmp_param.skb = skb_in;
+ icmp_param.offset = skb_in->nh.raw - skb_in->data;
+ icmp_out_count(icmp_param.data.icmph.type);
+ inet_sk(icmp_socket->sk)->tos = tos;
+ ipc.addr = iph->saddr;
+ ipc.opt = &icmp_param.replyopts;
+
+ {
+ struct flowi fl = {
+ .nl_u = {
+ .ip4_u = {
+ .daddr = icmp_param.replyopts.srr ?
+ icmp_param.replyopts.faddr :
+ iph->saddr,
+ .saddr = saddr,
+ .tos = RT_TOS(tos)
+ }
+ },
+ .proto = IPPROTO_ICMP,
+ .uli_u = {
+ .icmpt = {
+ .type = type,
+ .code = code
+ }
+ }
+ };
+ if (ip_route_output_key(&rt, &fl))
+ goto out_unlock;
+ }
+
+ if (!icmpv4_xrlim_allow(rt, type, code))
+ goto ende;
+
+ /* RFC says return as much as we can without exceeding 576 bytes. */
+
+ room = dst_mtu(&rt->u.dst);
+ if (room > 576)
+ room = 576;
+ room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
+ room -= sizeof(struct icmphdr);
+
+ icmp_param.data_len = skb_in->len - icmp_param.offset;
+ if (icmp_param.data_len > room)
+ icmp_param.data_len = room;
+ icmp_param.head_len = sizeof(struct icmphdr);
+
+ icmp_push_reply(&icmp_param, &ipc, rt);
+ende:
+ ip_rt_put(rt);
+out_unlock:
+ icmp_xmit_unlock();
+out:;
+}
+
+
+/*
+ * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
+ */
+
+static void icmp_unreach(struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ struct icmphdr *icmph;
+ int hash, protocol;
+ struct net_protocol *ipprot;
+ struct sock *raw_sk;
+ u32 info = 0;
+
+ /*
+ * Incomplete header ?
+ * Only checks for the IP header, there should be an
+ * additional check for longer headers in upper levels.
+ */
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto out_err;
+
+ icmph = skb->h.icmph;
+ iph = (struct iphdr *)skb->data;
+
+ if (iph->ihl < 5) /* Mangled header, drop. */
+ goto out_err;
+
+ if (icmph->type == ICMP_DEST_UNREACH) {
+ switch (icmph->code & 15) {
+ case ICMP_NET_UNREACH:
+ case ICMP_HOST_UNREACH:
+ case ICMP_PROT_UNREACH:
+ case ICMP_PORT_UNREACH:
+ break;
+ case ICMP_FRAG_NEEDED:
+ if (ipv4_config.no_pmtu_disc) {
+ LIMIT_NETDEBUG(
+ printk(KERN_INFO "ICMP: %u.%u.%u.%u: "
+ "fragmentation needed "
+ "and DF set.\n",
+ NIPQUAD(iph->daddr)));
+ } else {
+ info = ip_rt_frag_needed(iph,
+ ntohs(icmph->un.frag.mtu));
+ if (!info)
+ goto out;
+ }
+ break;
+ case ICMP_SR_FAILED:
+ LIMIT_NETDEBUG(
+ printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
+ "Route Failed.\n",
+ NIPQUAD(iph->daddr)));
+ break;
+ default:
+ break;
+ }
+ if (icmph->code > NR_ICMP_UNREACH)
+ goto out;
+ } else if (icmph->type == ICMP_PARAMETERPROB)
+ info = ntohl(icmph->un.gateway) >> 24;
+
+ /*
+ * Throw it at our lower layers
+ *
+ * RFC 1122: 3.2.2 MUST extract the protocol ID from the passed
+ * header.
+ * RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the
+ * transport layer.
+ * RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to
+ * transport layer.
+ */
+
+ /*
+ * Check the other end isnt violating RFC 1122. Some routers send
+ * bogus responses to broadcast frames. If you see this message
+ * first check your netmask matches at both ends, if it does then
+ * get the other vendor to fix their kit.
+ */
+
+ if (!sysctl_icmp_ignore_bogus_error_responses &&
+ inet_addr_type(iph->daddr) == RTN_BROADCAST) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP "
+ "type %u, code %u "
+ "error to a broadcast: %u.%u.%u.%u on %s\n",
+ NIPQUAD(skb->nh.iph->saddr),
+ icmph->type, icmph->code,
+ NIPQUAD(iph->daddr),
+ skb->dev->name);
+ goto out;
+ }
+
+ /* Checkin full IP header plus 8 bytes of protocol to
+ * avoid additional coding at protocol handlers.
+ */
+ if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+ goto out;
+
+ iph = (struct iphdr *)skb->data;
+ protocol = iph->protocol;
+
+ /*
+ * Deliver ICMP message to raw sockets. Pretty useless feature?
+ */
+
+ /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
+ hash = protocol & (MAX_INET_PROTOS - 1);
+ read_lock(&raw_v4_lock);
+ if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) {
+ while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr,
+ iph->saddr,
+ skb->dev->ifindex)) != NULL) {
+ raw_err(raw_sk, skb, info);
+ raw_sk = sk_next(raw_sk);
+ iph = (struct iphdr *)skb->data;
+ }
+ }
+ read_unlock(&raw_v4_lock);
+
+ rcu_read_lock();
+ ipprot = rcu_dereference(inet_protos[hash]);
+ if (ipprot && ipprot->err_handler)
+ ipprot->err_handler(skb, info);
+ rcu_read_unlock();
+
+out:
+ return;
+out_err:
+ ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ goto out;
+}
+
+
+/*
+ * Handle ICMP_REDIRECT.
+ */
+
+static void icmp_redirect(struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ unsigned long ip;
+
+ if (skb->len < sizeof(struct iphdr))
+ goto out_err;
+
+ /*
+ * Get the copied header of the packet that caused the redirect
+ */
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto out;
+
+ iph = (struct iphdr *)skb->data;
+ ip = iph->daddr;
+
+ switch (skb->h.icmph->code & 7) {
+ case ICMP_REDIR_NET:
+ case ICMP_REDIR_NETTOS:
+ /*
+ * As per RFC recommendations now handle it as a host redirect.
+ */
+ case ICMP_REDIR_HOST:
+ case ICMP_REDIR_HOSTTOS:
+ ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway,
+ iph->saddr, iph->tos, skb->dev);
+ break;
+ }
+out:
+ return;
+out_err:
+ ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ goto out;
+}
+
+/*
+ * Handle ICMP_ECHO ("ping") requests.
+ *
+ * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
+ * requests.
+ * RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be
+ * included in the reply.
+ * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring
+ * echo requests, MUST have default=NOT.
+ * See also WRT handling of options once they are done and working.
+ */
+
+static void icmp_echo(struct sk_buff *skb)
+{
+ if (!sysctl_icmp_echo_ignore_all) {
+ struct icmp_bxm icmp_param;
+
+ icmp_param.data.icmph = *skb->h.icmph;
+ icmp_param.data.icmph.type = ICMP_ECHOREPLY;
+ icmp_param.skb = skb;
+ icmp_param.offset = 0;
+ icmp_param.data_len = skb->len;
+ icmp_param.head_len = sizeof(struct icmphdr);
+ icmp_reply(&icmp_param, skb);
+ }
+}
+
+/*
+ * Handle ICMP Timestamp requests.
+ * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests.
+ * SHOULD be in the kernel for minimum random latency.
+ * MUST be accurate to a few minutes.
+ * MUST be updated at least at 15Hz.
+ */
+static void icmp_timestamp(struct sk_buff *skb)
+{
+ struct timeval tv;
+ struct icmp_bxm icmp_param;
+ /*
+ * Too short.
+ */
+ if (skb->len < 4)
+ goto out_err;
+
+ /*
+ * Fill in the current time as ms since midnight UT:
+ */
+ do_gettimeofday(&tv);
+ icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * 1000 +
+ tv.tv_usec / 1000);
+ icmp_param.data.times[2] = icmp_param.data.times[1];
+ if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
+ BUG();
+ icmp_param.data.icmph = *skb->h.icmph;
+ icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
+ icmp_param.data.icmph.code = 0;
+ icmp_param.skb = skb;
+ icmp_param.offset = 0;
+ icmp_param.data_len = 0;
+ icmp_param.head_len = sizeof(struct icmphdr) + 12;
+ icmp_reply(&icmp_param, skb);
+out:
+ return;
+out_err:
+ ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ goto out;
+}
+
+
+/*
+ * Handle ICMP_ADDRESS_MASK requests. (RFC950)
+ *
+ * RFC1122 (3.2.2.9). A host MUST only send replies to
+ * ADDRESS_MASK requests if it's been configured as an address mask
+ * agent. Receiving a request doesn't constitute implicit permission to
+ * act as one. Of course, implementing this correctly requires (SHOULD)
+ * a way to turn the functionality on and off. Another one for sysctl(),
+ * I guess. -- MS
+ *
+ * RFC1812 (4.3.3.9). A router MUST implement it.
+ * A router SHOULD have switch turning it on/off.
+ * This switch MUST be ON by default.
+ *
+ * Gratuitous replies, zero-source replies are not implemented,
+ * that complies with RFC. DO NOT implement them!!! All the idea
+ * of broadcast addrmask replies as specified in RFC950 is broken.
+ * The problem is that it is not uncommon to have several prefixes
+ * on one physical interface. Moreover, addrmask agent can even be
+ * not aware of existing another prefixes.
+ * If source is zero, addrmask agent cannot choose correct prefix.
+ * Gratuitous mask announcements suffer from the same problem.
+ * RFC1812 explains it, but still allows to use ADDRMASK,
+ * that is pretty silly. --ANK
+ *
+ * All these rules are so bizarre, that I removed kernel addrmask
+ * support at all. It is wrong, it is obsolete, nobody uses it in
+ * any case. --ANK
+ *
+ * Furthermore you can do it with a usermode address agent program
+ * anyway...
+ */
+
+static void icmp_address(struct sk_buff *skb)
+{
+#if 0
+ if (net_ratelimit())
+ printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
+#endif
+}
+
+/*
+ * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain
+ * loudly if an inconsistency is found.
+ */
+
+static void icmp_address_reply(struct sk_buff *skb)
+{
+ struct rtable *rt = (struct rtable *)skb->dst;
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
+ struct in_ifaddr *ifa;
+
+ if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
+ goto out;
+
+ in_dev = in_dev_get(dev);
+ if (!in_dev)
+ goto out;
+ rcu_read_lock();
+ if (in_dev->ifa_list &&
+ IN_DEV_LOG_MARTIANS(in_dev) &&
+ IN_DEV_FORWARD(in_dev)) {
+ u32 _mask, *mp;
+
+ mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask);
+ if (mp == NULL)
+ BUG();
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ if (*mp == ifa->ifa_mask &&
+ inet_ifa_match(rt->rt_src, ifa))
+ break;
+ }
+ if (!ifa && net_ratelimit()) {
+ printk(KERN_INFO "Wrong address mask %u.%u.%u.%u from "
+ "%s/%u.%u.%u.%u\n",
+ NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src));
+ }
+ }
+ rcu_read_unlock();
+ in_dev_put(in_dev);
+out:;
+}
+
+static void icmp_discard(struct sk_buff *skb)
+{
+}
+
+/*
+ * Deal with incoming ICMP packets.
+ */
+int icmp_rcv(struct sk_buff *skb)
+{
+ struct icmphdr *icmph;
+ struct rtable *rt = (struct rtable *)skb->dst;
+
+ ICMP_INC_STATS_BH(ICMP_MIB_INMSGS);
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_HW:
+ if (!(u16)csum_fold(skb->csum))
+ break;
+ NETDEBUG(if (net_ratelimit())
+ printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
+ case CHECKSUM_NONE:
+ if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
+ goto error;
+ default:;
+ }
+
+ if (!pskb_pull(skb, sizeof(struct icmphdr)))
+ goto error;
+
+ icmph = skb->h.icmph;
+
+ /*
+ * 18 is the highest 'known' ICMP type. Anything else is a mystery
+ *
+ * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
+ * discarded.
+ */
+ if (icmph->type > NR_ICMP_TYPES)
+ goto error;
+
+
+ /*
+ * Parse the ICMP message
+ */
+
+ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+ /*
+ * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
+ * silently ignored (we let user decide with a sysctl).
+ * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
+ * discarded if to broadcast/multicast.
+ */
+ if (icmph->type == ICMP_ECHO &&
+ sysctl_icmp_echo_ignore_broadcasts) {
+ goto error;
+ }
+ if (icmph->type != ICMP_ECHO &&
+ icmph->type != ICMP_TIMESTAMP &&
+ icmph->type != ICMP_ADDRESS &&
+ icmph->type != ICMP_ADDRESSREPLY) {
+ goto error;
+ }
+ }
+
+ ICMP_INC_STATS_BH(icmp_pointers[icmph->type].input_entry);
+ icmp_pointers[icmph->type].handler(skb);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+error:
+ ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ goto drop;
+}
+
+/*
+ * This table is the definition of how we handle ICMP.
+ */
+static struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
+ [ICMP_ECHOREPLY] = {
+ .output_entry = ICMP_MIB_OUTECHOREPS,
+ .input_entry = ICMP_MIB_INECHOREPS,
+ .handler = icmp_discard,
+ },
+ [1] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_INERRORS,
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [2] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_INERRORS,
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [ICMP_DEST_UNREACH] = {
+ .output_entry = ICMP_MIB_OUTDESTUNREACHS,
+ .input_entry = ICMP_MIB_INDESTUNREACHS,
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_SOURCE_QUENCH] = {
+ .output_entry = ICMP_MIB_OUTSRCQUENCHS,
+ .input_entry = ICMP_MIB_INSRCQUENCHS,
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_REDIRECT] = {
+ .output_entry = ICMP_MIB_OUTREDIRECTS,
+ .input_entry = ICMP_MIB_INREDIRECTS,
+ .handler = icmp_redirect,
+ .error = 1,
+ },
+ [6] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_INERRORS,
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [7] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_INERRORS,
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [ICMP_ECHO] = {
+ .output_entry = ICMP_MIB_OUTECHOS,
+ .input_entry = ICMP_MIB_INECHOS,
+ .handler = icmp_echo,
+ },
+ [9] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_INERRORS,
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [10] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_INERRORS,
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [ICMP_TIME_EXCEEDED] = {
+ .output_entry = ICMP_MIB_OUTTIMEEXCDS,
+ .input_entry = ICMP_MIB_INTIMEEXCDS,
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_PARAMETERPROB] = {
+ .output_entry = ICMP_MIB_OUTPARMPROBS,
+ .input_entry = ICMP_MIB_INPARMPROBS,
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_TIMESTAMP] = {
+ .output_entry = ICMP_MIB_OUTTIMESTAMPS,
+ .input_entry = ICMP_MIB_INTIMESTAMPS,
+ .handler = icmp_timestamp,
+ },
+ [ICMP_TIMESTAMPREPLY] = {
+ .output_entry = ICMP_MIB_OUTTIMESTAMPREPS,
+ .input_entry = ICMP_MIB_INTIMESTAMPREPS,
+ .handler = icmp_discard,
+ },
+ [ICMP_INFO_REQUEST] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_DUMMY,
+ .handler = icmp_discard,
+ },
+ [ICMP_INFO_REPLY] = {
+ .output_entry = ICMP_MIB_DUMMY,
+ .input_entry = ICMP_MIB_DUMMY,
+ .handler = icmp_discard,
+ },
+ [ICMP_ADDRESS] = {
+ .output_entry = ICMP_MIB_OUTADDRMASKS,
+ .input_entry = ICMP_MIB_INADDRMASKS,
+ .handler = icmp_address,
+ },
+ [ICMP_ADDRESSREPLY] = {
+ .output_entry = ICMP_MIB_OUTADDRMASKREPS,
+ .input_entry = ICMP_MIB_INADDRMASKREPS,
+ .handler = icmp_address_reply,
+ },
+};
+
+void __init icmp_init(struct net_proto_family *ops)
+{
+ struct inet_sock *inet;
+ int i;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ int err;
+
+ if (!cpu_possible(i))
+ continue;
+
+ err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP,
+ &per_cpu(__icmp_socket, i));
+
+ if (err < 0)
+ panic("Failed to create the ICMP control socket.\n");
+
+ per_cpu(__icmp_socket, i)->sk->sk_allocation = GFP_ATOMIC;
+
+ /* Enough space for 2 64K ICMP packets, including
+ * sk_buff struct overhead.
+ */
+ per_cpu(__icmp_socket, i)->sk->sk_sndbuf =
+ (2 * ((64 * 1024) + sizeof(struct sk_buff)));
+
+ inet = inet_sk(per_cpu(__icmp_socket, i)->sk);
+ inet->uc_ttl = -1;
+ inet->pmtudisc = IP_PMTUDISC_DONT;
+
+ /* Unhash it so that IP input processing does not even
+ * see it, we do not wish this socket to see incoming
+ * packets.
+ */
+ per_cpu(__icmp_socket, i)->sk->sk_prot->unhash(per_cpu(__icmp_socket, i)->sk);
+ }
+}
+
+EXPORT_SYMBOL(icmp_err_convert);
+EXPORT_SYMBOL(icmp_send);
+EXPORT_SYMBOL(icmp_statistics);
+EXPORT_SYMBOL(xrlim_allow);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
new file mode 100644
index 000000000000..1f3183168a90
--- /dev/null
+++ b/net/ipv4/igmp.c
@@ -0,0 +1,2473 @@
+/*
+ * Linux NET3: Internet Group Management Protocol [IGMP]
+ *
+ * This code implements the IGMP protocol as defined in RFC1112. There has
+ * been a further revision of this protocol since which is now supported.
+ *
+ * If you have trouble with this module be careful what gcc you have used,
+ * the older version didn't come out right using gcc 2.5.8, the newer one
+ * seems to fall out with gcc 2.6.2.
+ *
+ * Version: $Id: igmp.c,v 1.47 2002/02/01 22:01:03 davem Exp $
+ *
+ * Authors:
+ * Alan Cox <Alan.Cox@linux.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ *
+ * Alan Cox : Added lots of __inline__ to optimise
+ * the memory usage of all the tiny little
+ * functions.
+ * Alan Cox : Dumped the header building experiment.
+ * Alan Cox : Minor tweaks ready for multicast routing
+ * and extended IGMP protocol.
+ * Alan Cox : Removed a load of inline directives. Gcc 2.5.8
+ * writes utterly bogus code otherwise (sigh)
+ * fixed IGMP loopback to behave in the manner
+ * desired by mrouted, fixed the fact it has been
+ * broken since 1.3.6 and cleaned up a few minor
+ * points.
+ *
+ * Chih-Jen Chang : Tried to revise IGMP to Version 2
+ * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu
+ * The enhancements are mainly based on Steve Deering's
+ * ipmulti-3.5 source code.
+ * Chih-Jen Chang : Added the igmp_get_mrouter_info and
+ * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of
+ * the mrouted version on that device.
+ * Chih-Jen Chang : Added the max_resp_time parameter to
+ * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter
+ * to identify the multicast router version
+ * and do what the IGMP version 2 specified.
+ * Chih-Jen Chang : Added a timer to revert to IGMP V2 router
+ * Tsu-Sheng Tsao if the specified time expired.
+ * Alan Cox : Stop IGMP from 0.0.0.0 being accepted.
+ * Alan Cox : Use GFP_ATOMIC in the right places.
+ * Christian Daudt : igmp timer wasn't set for local group
+ * memberships but was being deleted,
+ * which caused a "del_timer() called
+ * from %p with timer not initialized\n"
+ * message (960131).
+ * Christian Daudt : removed del_timer from
+ * igmp_timer_expire function (960205).
+ * Christian Daudt : igmp_heard_report now only calls
+ * igmp_timer_expire if tm->running is
+ * true (960216).
+ * Malcolm Beattie : ttl comparison wrong in igmp_rcv made
+ * igmp_heard_query never trigger. Expiry
+ * miscalculation fixed in igmp_heard_query
+ * and random() made to return unsigned to
+ * prevent negative expiry times.
+ * Alexey Kuznetsov: Wrong group leaving behaviour, backport
+ * fix from pending 2.1.x patches.
+ * Alan Cox: Forget to enable FDDI support earlier.
+ * Alexey Kuznetsov: Fixed leaving groups on device down.
+ * Alexey Kuznetsov: Accordance to igmp-v2-06 draft.
+ * David L Stevens: IGMPv3 support, with help from
+ * Vinay Kulkarni
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/times.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#endif
+
+#define IP_MAX_MEMBERSHIPS 20
+#define IP_MAX_MSF 10
+
+#ifdef CONFIG_IP_MULTICAST
+/* Parameter names and values are taken from igmp-v2-06 draft */
+
+#define IGMP_V1_Router_Present_Timeout (400*HZ)
+#define IGMP_V2_Router_Present_Timeout (400*HZ)
+#define IGMP_Unsolicited_Report_Interval (10*HZ)
+#define IGMP_Query_Response_Interval (10*HZ)
+#define IGMP_Unsolicited_Report_Count 2
+
+
+#define IGMP_Initial_Report_Delay (1)
+
+/* IGMP_Initial_Report_Delay is not from IGMP specs!
+ * IGMP specs require to report membership immediately after
+ * joining a group, but we delay the first report by a
+ * small interval. It seems more natural and still does not
+ * contradict to specs provided this delay is small enough.
+ */
+
+#define IGMP_V1_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 1 || \
+ (in_dev)->cnf.force_igmp_version == 1 || \
+ ((in_dev)->mr_v1_seen && \
+ time_before(jiffies, (in_dev)->mr_v1_seen)))
+#define IGMP_V2_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 2 || \
+ (in_dev)->cnf.force_igmp_version == 2 || \
+ ((in_dev)->mr_v2_seen && \
+ time_before(jiffies, (in_dev)->mr_v2_seen)))
+
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
+static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr);
+static void igmpv3_clear_delrec(struct in_device *in_dev);
+static int sf_setstate(struct ip_mc_list *pmc);
+static void sf_markstate(struct ip_mc_list *pmc);
+#endif
+static void ip_mc_clear_src(struct ip_mc_list *pmc);
+static int ip_mc_add_src(struct in_device *in_dev, __u32 *pmca, int sfmode,
+ int sfcount, __u32 *psfsrc, int delta);
+
+static void ip_ma_put(struct ip_mc_list *im)
+{
+ if (atomic_dec_and_test(&im->refcnt)) {
+ in_dev_put(im->interface);
+ kfree(im);
+ }
+}
+
+#ifdef CONFIG_IP_MULTICAST
+
+/*
+ * Timer management
+ */
+
+static __inline__ void igmp_stop_timer(struct ip_mc_list *im)
+{
+ spin_lock_bh(&im->lock);
+ if (del_timer(&im->timer))
+ atomic_dec(&im->refcnt);
+ im->tm_running=0;
+ im->reporter = 0;
+ im->unsolicit_count = 0;
+ spin_unlock_bh(&im->lock);
+}
+
+/* It must be called with locked im->lock */
+static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
+{
+ int tv=net_random() % max_delay;
+
+ im->tm_running=1;
+ if (!mod_timer(&im->timer, jiffies+tv+2))
+ atomic_inc(&im->refcnt);
+}
+
+static void igmp_gq_start_timer(struct in_device *in_dev)
+{
+ int tv = net_random() % in_dev->mr_maxdelay;
+
+ in_dev->mr_gq_running = 1;
+ if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
+ in_dev_hold(in_dev);
+}
+
+static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
+{
+ int tv = net_random() % delay;
+
+ if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))
+ in_dev_hold(in_dev);
+}
+
+static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
+{
+ spin_lock_bh(&im->lock);
+ im->unsolicit_count = 0;
+ if (del_timer(&im->timer)) {
+ if ((long)(im->timer.expires-jiffies) < max_delay) {
+ add_timer(&im->timer);
+ im->tm_running=1;
+ spin_unlock_bh(&im->lock);
+ return;
+ }
+ atomic_dec(&im->refcnt);
+ }
+ igmp_start_timer(im, max_delay);
+ spin_unlock_bh(&im->lock);
+}
+
+
+/*
+ * Send an IGMP report.
+ */
+
+#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
+
+
+static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type,
+ int gdeleted, int sdeleted)
+{
+ switch (type) {
+ case IGMPV3_MODE_IS_INCLUDE:
+ case IGMPV3_MODE_IS_EXCLUDE:
+ if (gdeleted || sdeleted)
+ return 0;
+ return !(pmc->gsquery && !psf->sf_gsresp);
+ case IGMPV3_CHANGE_TO_INCLUDE:
+ if (gdeleted || sdeleted)
+ return 0;
+ return psf->sf_count[MCAST_INCLUDE] != 0;
+ case IGMPV3_CHANGE_TO_EXCLUDE:
+ if (gdeleted || sdeleted)
+ return 0;
+ if (pmc->sfcount[MCAST_EXCLUDE] == 0 ||
+ psf->sf_count[MCAST_INCLUDE])
+ return 0;
+ return pmc->sfcount[MCAST_EXCLUDE] ==
+ psf->sf_count[MCAST_EXCLUDE];
+ case IGMPV3_ALLOW_NEW_SOURCES:
+ if (gdeleted || !psf->sf_crcount)
+ return 0;
+ return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted;
+ case IGMPV3_BLOCK_OLD_SOURCES:
+ if (pmc->sfmode == MCAST_INCLUDE)
+ return gdeleted || (psf->sf_crcount && sdeleted);
+ return psf->sf_crcount && !gdeleted && !sdeleted;
+ }
+ return 0;
+}
+
+static int
+igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
+{
+ struct ip_sf_list *psf;
+ int scount = 0;
+
+ for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ if (!is_in(pmc, psf, type, gdeleted, sdeleted))
+ continue;
+ scount++;
+ }
+ return scount;
+}
+
+static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
+{
+ struct sk_buff *skb;
+ struct rtable *rt;
+ struct iphdr *pip;
+ struct igmpv3_report *pig;
+
+ skb = alloc_skb(size + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
+ if (skb == NULL)
+ return NULL;
+
+ {
+ struct flowi fl = { .oif = dev->ifindex,
+ .nl_u = { .ip4_u = {
+ .daddr = IGMPV3_ALL_MCR } },
+ .proto = IPPROTO_IGMP };
+ if (ip_route_output_key(&rt, &fl)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+ }
+ if (rt->rt_src == 0) {
+ kfree_skb(skb);
+ ip_rt_put(rt);
+ return NULL;
+ }
+
+ skb->dst = &rt->u.dst;
+ skb->dev = dev;
+
+ skb_reserve(skb, LL_RESERVED_SPACE(dev));
+
+ skb->nh.iph = pip =(struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
+
+ pip->version = 4;
+ pip->ihl = (sizeof(struct iphdr)+4)>>2;
+ pip->tos = 0xc0;
+ pip->frag_off = htons(IP_DF);
+ pip->ttl = 1;
+ pip->daddr = rt->rt_dst;
+ pip->saddr = rt->rt_src;
+ pip->protocol = IPPROTO_IGMP;
+ pip->tot_len = 0; /* filled in later */
+ ip_select_ident(pip, &rt->u.dst, NULL);
+ ((u8*)&pip[1])[0] = IPOPT_RA;
+ ((u8*)&pip[1])[1] = 4;
+ ((u8*)&pip[1])[2] = 0;
+ ((u8*)&pip[1])[3] = 0;
+
+ pig =(struct igmpv3_report *)skb_put(skb, sizeof(*pig));
+ skb->h.igmph = (struct igmphdr *)pig;
+ pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT;
+ pig->resv1 = 0;
+ pig->csum = 0;
+ pig->resv2 = 0;
+ pig->ngrec = 0;
+ return skb;
+}
+
+static int igmpv3_sendpack(struct sk_buff *skb)
+{
+ struct iphdr *pip = skb->nh.iph;
+ struct igmphdr *pig = skb->h.igmph;
+ int iplen, igmplen;
+
+ iplen = skb->tail - (unsigned char *)skb->nh.iph;
+ pip->tot_len = htons(iplen);
+ ip_send_check(pip);
+
+ igmplen = skb->tail - (unsigned char *)skb->h.igmph;
+ pig->csum = ip_compute_csum((void *)skb->h.igmph, igmplen);
+
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev,
+ dst_output);
+}
+
+static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
+{
+ return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc,type,gdel,sdel);
+}
+
+static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
+ int type, struct igmpv3_grec **ppgr)
+{
+ struct net_device *dev = pmc->interface->dev;
+ struct igmpv3_report *pih;
+ struct igmpv3_grec *pgr;
+
+ if (!skb)
+ skb = igmpv3_newpack(dev, dev->mtu);
+ if (!skb)
+ return NULL;
+ pgr = (struct igmpv3_grec *)skb_put(skb, sizeof(struct igmpv3_grec));
+ pgr->grec_type = type;
+ pgr->grec_auxwords = 0;
+ pgr->grec_nsrcs = 0;
+ pgr->grec_mca = pmc->multiaddr;
+ pih = (struct igmpv3_report *)skb->h.igmph;
+ pih->ngrec = htons(ntohs(pih->ngrec)+1);
+ *ppgr = pgr;
+ return skb;
+}
+
+#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \
+ skb_tailroom(skb)) : 0)
+
+static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
+ int type, int gdeleted, int sdeleted)
+{
+ struct net_device *dev = pmc->interface->dev;
+ struct igmpv3_report *pih;
+ struct igmpv3_grec *pgr = NULL;
+ struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
+ int scount, first, isquery, truncate;
+
+ if (pmc->multiaddr == IGMP_ALL_HOSTS)
+ return skb;
+
+ isquery = type == IGMPV3_MODE_IS_INCLUDE ||
+ type == IGMPV3_MODE_IS_EXCLUDE;
+ truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
+ type == IGMPV3_CHANGE_TO_EXCLUDE;
+
+ psf_list = sdeleted ? &pmc->tomb : &pmc->sources;
+
+ if (!*psf_list) {
+ if (type == IGMPV3_ALLOW_NEW_SOURCES ||
+ type == IGMPV3_BLOCK_OLD_SOURCES)
+ return skb;
+ if (pmc->crcount || isquery) {
+ /* make sure we have room for group header and at
+ * least one source.
+ */
+ if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)+
+ sizeof(__u32)) {
+ igmpv3_sendpack(skb);
+ skb = NULL; /* add_grhead will get a new one */
+ }
+ skb = add_grhead(skb, pmc, type, &pgr);
+ }
+ return skb;
+ }
+ pih = skb ? (struct igmpv3_report *)skb->h.igmph : NULL;
+
+ /* EX and TO_EX get a fresh packet, if needed */
+ if (truncate) {
+ if (pih && pih->ngrec &&
+ AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
+ if (skb)
+ igmpv3_sendpack(skb);
+ skb = igmpv3_newpack(dev, dev->mtu);
+ }
+ }
+ first = 1;
+ scount = 0;
+ psf_prev = NULL;
+ for (psf=*psf_list; psf; psf=psf_next) {
+ u32 *psrc;
+
+ psf_next = psf->sf_next;
+
+ if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
+ psf_prev = psf;
+ continue;
+ }
+
+ /* clear marks on query responses */
+ if (isquery)
+ psf->sf_gsresp = 0;
+
+ if (AVAILABLE(skb) < sizeof(u32) +
+ first*sizeof(struct igmpv3_grec)) {
+ if (truncate && !first)
+ break; /* truncate these */
+ if (pgr)
+ pgr->grec_nsrcs = htons(scount);
+ if (skb)
+ igmpv3_sendpack(skb);
+ skb = igmpv3_newpack(dev, dev->mtu);
+ first = 1;
+ scount = 0;
+ }
+ if (first) {
+ skb = add_grhead(skb, pmc, type, &pgr);
+ first = 0;
+ }
+ psrc = (u32 *)skb_put(skb, sizeof(u32));
+ *psrc = psf->sf_inaddr;
+ scount++;
+ if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
+ type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
+ psf->sf_crcount--;
+ if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
+ if (psf_prev)
+ psf_prev->sf_next = psf->sf_next;
+ else
+ *psf_list = psf->sf_next;
+ kfree(psf);
+ continue;
+ }
+ }
+ psf_prev = psf;
+ }
+ if (pgr)
+ pgr->grec_nsrcs = htons(scount);
+
+ if (isquery)
+ pmc->gsquery = 0; /* clear query state on report */
+ return skb;
+}
+
+static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
+{
+ struct sk_buff *skb = NULL;
+ int type;
+
+ if (!pmc) {
+ read_lock(&in_dev->mc_list_lock);
+ for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ if (pmc->multiaddr == IGMP_ALL_HOSTS)
+ continue;
+ spin_lock_bh(&pmc->lock);
+ if (pmc->sfcount[MCAST_EXCLUDE])
+ type = IGMPV3_MODE_IS_EXCLUDE;
+ else
+ type = IGMPV3_MODE_IS_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0);
+ spin_unlock_bh(&pmc->lock);
+ }
+ read_unlock(&in_dev->mc_list_lock);
+ } else {
+ spin_lock_bh(&pmc->lock);
+ if (pmc->sfcount[MCAST_EXCLUDE])
+ type = IGMPV3_MODE_IS_EXCLUDE;
+ else
+ type = IGMPV3_MODE_IS_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0);
+ spin_unlock_bh(&pmc->lock);
+ }
+ if (!skb)
+ return 0;
+ return igmpv3_sendpack(skb);
+}
+
+/*
+ * remove zero-count source records from a source filter list
+ */
+static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
+{
+ struct ip_sf_list *psf_prev, *psf_next, *psf;
+
+ psf_prev = NULL;
+ for (psf=*ppsf; psf; psf = psf_next) {
+ psf_next = psf->sf_next;
+ if (psf->sf_crcount == 0) {
+ if (psf_prev)
+ psf_prev->sf_next = psf->sf_next;
+ else
+ *ppsf = psf->sf_next;
+ kfree(psf);
+ } else
+ psf_prev = psf;
+ }
+}
+
+static void igmpv3_send_cr(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc, *pmc_prev, *pmc_next;
+ struct sk_buff *skb = NULL;
+ int type, dtype;
+
+ read_lock(&in_dev->mc_list_lock);
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+
+ /* deleted MCA's */
+ pmc_prev = NULL;
+ for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) {
+ pmc_next = pmc->next;
+ if (pmc->sfmode == MCAST_INCLUDE) {
+ type = IGMPV3_BLOCK_OLD_SOURCES;
+ dtype = IGMPV3_BLOCK_OLD_SOURCES;
+ skb = add_grec(skb, pmc, type, 1, 0);
+ skb = add_grec(skb, pmc, dtype, 1, 1);
+ }
+ if (pmc->crcount) {
+ pmc->crcount--;
+ if (pmc->sfmode == MCAST_EXCLUDE) {
+ type = IGMPV3_CHANGE_TO_INCLUDE;
+ skb = add_grec(skb, pmc, type, 1, 0);
+ }
+ if (pmc->crcount == 0) {
+ igmpv3_clear_zeros(&pmc->tomb);
+ igmpv3_clear_zeros(&pmc->sources);
+ }
+ }
+ if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) {
+ if (pmc_prev)
+ pmc_prev->next = pmc_next;
+ else
+ in_dev->mc_tomb = pmc_next;
+ in_dev_put(pmc->interface);
+ kfree(pmc);
+ } else
+ pmc_prev = pmc;
+ }
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+ /* change recs */
+ for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ spin_lock_bh(&pmc->lock);
+ if (pmc->sfcount[MCAST_EXCLUDE]) {
+ type = IGMPV3_BLOCK_OLD_SOURCES;
+ dtype = IGMPV3_ALLOW_NEW_SOURCES;
+ } else {
+ type = IGMPV3_ALLOW_NEW_SOURCES;
+ dtype = IGMPV3_BLOCK_OLD_SOURCES;
+ }
+ skb = add_grec(skb, pmc, type, 0, 0);
+ skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */
+
+ /* filter mode changes */
+ if (pmc->crcount) {
+ pmc->crcount--;
+ if (pmc->sfmode == MCAST_EXCLUDE)
+ type = IGMPV3_CHANGE_TO_EXCLUDE;
+ else
+ type = IGMPV3_CHANGE_TO_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0);
+ }
+ spin_unlock_bh(&pmc->lock);
+ }
+ read_unlock(&in_dev->mc_list_lock);
+
+ if (!skb)
+ return;
+ (void) igmpv3_sendpack(skb);
+}
+
+static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
+ int type)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct igmphdr *ih;
+ struct rtable *rt;
+ struct net_device *dev = in_dev->dev;
+ u32 group = pmc ? pmc->multiaddr : 0;
+ u32 dst;
+
+ if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
+ return igmpv3_send_report(in_dev, pmc);
+ else if (type == IGMP_HOST_LEAVE_MESSAGE)
+ dst = IGMP_ALL_ROUTER;
+ else
+ dst = group;
+
+ {
+ struct flowi fl = { .oif = dev->ifindex,
+ .nl_u = { .ip4_u = { .daddr = dst } },
+ .proto = IPPROTO_IGMP };
+ if (ip_route_output_key(&rt, &fl))
+ return -1;
+ }
+ if (rt->rt_src == 0) {
+ ip_rt_put(rt);
+ return -1;
+ }
+
+ skb=alloc_skb(IGMP_SIZE+LL_RESERVED_SPACE(dev), GFP_ATOMIC);
+ if (skb == NULL) {
+ ip_rt_put(rt);
+ return -1;
+ }
+
+ skb->dst = &rt->u.dst;
+
+ skb_reserve(skb, LL_RESERVED_SPACE(dev));
+
+ skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
+
+ iph->version = 4;
+ iph->ihl = (sizeof(struct iphdr)+4)>>2;
+ iph->tos = 0xc0;
+ iph->frag_off = htons(IP_DF);
+ iph->ttl = 1;
+ iph->daddr = dst;
+ iph->saddr = rt->rt_src;
+ iph->protocol = IPPROTO_IGMP;
+ iph->tot_len = htons(IGMP_SIZE);
+ ip_select_ident(iph, &rt->u.dst, NULL);
+ ((u8*)&iph[1])[0] = IPOPT_RA;
+ ((u8*)&iph[1])[1] = 4;
+ ((u8*)&iph[1])[2] = 0;
+ ((u8*)&iph[1])[3] = 0;
+ ip_send_check(iph);
+
+ ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+ ih->type=type;
+ ih->code=0;
+ ih->csum=0;
+ ih->group=group;
+ ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr));
+
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ dst_output);
+}
+
+static void igmp_gq_timer_expire(unsigned long data)
+{
+ struct in_device *in_dev = (struct in_device *)data;
+
+ in_dev->mr_gq_running = 0;
+ igmpv3_send_report(in_dev, NULL);
+ __in_dev_put(in_dev);
+}
+
+static void igmp_ifc_timer_expire(unsigned long data)
+{
+ struct in_device *in_dev = (struct in_device *)data;
+
+ igmpv3_send_cr(in_dev);
+ if (in_dev->mr_ifc_count) {
+ in_dev->mr_ifc_count--;
+ igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval);
+ }
+ __in_dev_put(in_dev);
+}
+
+static void igmp_ifc_event(struct in_device *in_dev)
+{
+ if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+ return;
+ in_dev->mr_ifc_count = in_dev->mr_qrv ? in_dev->mr_qrv :
+ IGMP_Unsolicited_Report_Count;
+ igmp_ifc_start_timer(in_dev, 1);
+}
+
+
+static void igmp_timer_expire(unsigned long data)
+{
+ struct ip_mc_list *im=(struct ip_mc_list *)data;
+ struct in_device *in_dev = im->interface;
+
+ spin_lock(&im->lock);
+ im->tm_running=0;
+
+ if (im->unsolicit_count) {
+ im->unsolicit_count--;
+ igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+ }
+ im->reporter = 1;
+ spin_unlock(&im->lock);
+
+ if (IGMP_V1_SEEN(in_dev))
+ igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
+ else if (IGMP_V2_SEEN(in_dev))
+ igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
+ else
+ igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
+
+ ip_ma_put(im);
+}
+
+static void igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __u32 *srcs)
+{
+ struct ip_sf_list *psf;
+ int i, scount;
+
+ scount = 0;
+ for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ if (scount == nsrcs)
+ break;
+ for (i=0; i<nsrcs; i++)
+ if (srcs[i] == psf->sf_inaddr) {
+ psf->sf_gsresp = 1;
+ scount++;
+ break;
+ }
+ }
+}
+
+static void igmp_heard_report(struct in_device *in_dev, u32 group)
+{
+ struct ip_mc_list *im;
+
+ /* Timers are only set for non-local groups */
+
+ if (group == IGMP_ALL_HOSTS)
+ return;
+
+ read_lock(&in_dev->mc_list_lock);
+ for (im=in_dev->mc_list; im!=NULL; im=im->next) {
+ if (im->multiaddr == group) {
+ igmp_stop_timer(im);
+ break;
+ }
+ }
+ read_unlock(&in_dev->mc_list_lock);
+}
+
+static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
+ int len)
+{
+ struct igmphdr *ih = skb->h.igmph;
+ struct igmpv3_query *ih3 = (struct igmpv3_query *)ih;
+ struct ip_mc_list *im;
+ u32 group = ih->group;
+ int max_delay;
+ int mark = 0;
+
+
+ if (len == 8) {
+ if (ih->code == 0) {
+ /* Alas, old v1 router presents here. */
+
+ max_delay = IGMP_Query_Response_Interval;
+ in_dev->mr_v1_seen = jiffies +
+ IGMP_V1_Router_Present_Timeout;
+ group = 0;
+ } else {
+ /* v2 router present */
+ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
+ in_dev->mr_v2_seen = jiffies +
+ IGMP_V2_Router_Present_Timeout;
+ }
+ /* cancel the interface change timer */
+ in_dev->mr_ifc_count = 0;
+ if (del_timer(&in_dev->mr_ifc_timer))
+ __in_dev_put(in_dev);
+ /* clear deleted report items */
+ igmpv3_clear_delrec(in_dev);
+ } else if (len < 12) {
+ return; /* ignore bogus packet; freed by caller */
+ } else { /* v3 */
+ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
+ return;
+
+ ih3 = (struct igmpv3_query *) skb->h.raw;
+ if (ih3->nsrcs) {
+ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
+ + ntohs(ih3->nsrcs)*sizeof(__u32)))
+ return;
+ ih3 = (struct igmpv3_query *) skb->h.raw;
+ }
+
+ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+ if (!max_delay)
+ max_delay = 1; /* can't mod w/ 0 */
+ in_dev->mr_maxdelay = max_delay;
+ if (ih3->qrv)
+ in_dev->mr_qrv = ih3->qrv;
+ if (!group) { /* general query */
+ if (ih3->nsrcs)
+ return; /* no sources allowed */
+ igmp_gq_start_timer(in_dev);
+ return;
+ }
+ /* mark sources to include, if group & source-specific */
+ mark = ih3->nsrcs != 0;
+ }
+
+ /*
+ * - Start the timers in all of our membership records
+ * that the query applies to for the interface on
+ * which the query arrived excl. those that belong
+ * to a "local" group (224.0.0.X)
+ * - For timers already running check if they need to
+ * be reset.
+ * - Use the igmp->igmp_code field as the maximum
+ * delay possible
+ */
+ read_lock(&in_dev->mc_list_lock);
+ for (im=in_dev->mc_list; im!=NULL; im=im->next) {
+ if (group && group != im->multiaddr)
+ continue;
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ continue;
+ spin_lock_bh(&im->lock);
+ if (im->tm_running)
+ im->gsquery = im->gsquery && mark;
+ else
+ im->gsquery = mark;
+ if (im->gsquery)
+ igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);
+ spin_unlock_bh(&im->lock);
+ igmp_mod_timer(im, max_delay);
+ }
+ read_unlock(&in_dev->mc_list_lock);
+}
+
+int igmp_rcv(struct sk_buff *skb)
+{
+ /* This basically follows the spec line by line -- see RFC1112 */
+ struct igmphdr *ih;
+ struct in_device *in_dev = in_dev_get(skb->dev);
+ int len = skb->len;
+
+ if (in_dev==NULL) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ if (!pskb_may_pull(skb, sizeof(struct igmphdr)) ||
+ (u16)csum_fold(skb_checksum(skb, 0, len, 0))) {
+ in_dev_put(in_dev);
+ kfree_skb(skb);
+ return 0;
+ }
+
+ ih = skb->h.igmph;
+ switch (ih->type) {
+ case IGMP_HOST_MEMBERSHIP_QUERY:
+ igmp_heard_query(in_dev, skb, len);
+ break;
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ /* Is it our report looped back? */
+ if (((struct rtable*)skb->dst)->fl.iif == 0)
+ break;
+ igmp_heard_report(in_dev, ih->group);
+ break;
+ case IGMP_PIM:
+#ifdef CONFIG_IP_PIMSM_V1
+ in_dev_put(in_dev);
+ return pim_rcv_v1(skb);
+#endif
+ case IGMP_DVMRP:
+ case IGMP_TRACE:
+ case IGMP_HOST_LEAVE_MESSAGE:
+ case IGMP_MTRACE:
+ case IGMP_MTRACE_RESP:
+ break;
+ default:
+ NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type));
+ }
+ in_dev_put(in_dev);
+ kfree_skb(skb);
+ return 0;
+}
+
+#endif
+
+
+/*
+ * Add a filter to a device
+ */
+
+static void ip_mc_filter_add(struct in_device *in_dev, u32 addr)
+{
+ char buf[MAX_ADDR_LEN];
+ struct net_device *dev = in_dev->dev;
+
+ /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
+ We will get multicast token leakage, when IFF_MULTICAST
+ is changed. This check should be done in dev->set_multicast_list
+ routine. Something sort of:
+ if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
+ --ANK
+ */
+ if (arp_mc_map(addr, buf, dev, 0) == 0)
+ dev_mc_add(dev,buf,dev->addr_len,0);
+}
+
+/*
+ * Remove a filter from a device
+ */
+
+static void ip_mc_filter_del(struct in_device *in_dev, u32 addr)
+{
+ char buf[MAX_ADDR_LEN];
+ struct net_device *dev = in_dev->dev;
+
+ if (arp_mc_map(addr, buf, dev, 0) == 0)
+ dev_mc_delete(dev,buf,dev->addr_len,0);
+}
+
+#ifdef CONFIG_IP_MULTICAST
+/*
+ * deleted ip_mc_list manipulation
+ */
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
+{
+ struct ip_mc_list *pmc;
+
+ /* this is an "ip_mc_list" for convenience; only the fields below
+ * are actually used. In particular, the refcnt and users are not
+ * used for management of the delete list. Using the same structure
+ * for deleted items allows change reports to use common code with
+ * non-deleted or query-response MCA's.
+ */
+ pmc = (struct ip_mc_list *)kmalloc(sizeof(*pmc), GFP_KERNEL);
+ if (!pmc)
+ return;
+ memset(pmc, 0, sizeof(*pmc));
+ spin_lock_bh(&im->lock);
+ pmc->interface = im->interface;
+ in_dev_hold(in_dev);
+ pmc->multiaddr = im->multiaddr;
+ pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
+ IGMP_Unsolicited_Report_Count;
+ pmc->sfmode = im->sfmode;
+ if (pmc->sfmode == MCAST_INCLUDE) {
+ struct ip_sf_list *psf;
+
+ pmc->tomb = im->tomb;
+ pmc->sources = im->sources;
+ im->tomb = im->sources = NULL;
+ for (psf=pmc->sources; psf; psf=psf->sf_next)
+ psf->sf_crcount = pmc->crcount;
+ }
+ spin_unlock_bh(&im->lock);
+
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+ pmc->next = in_dev->mc_tomb;
+ in_dev->mc_tomb = pmc;
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+}
+
+static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr)
+{
+ struct ip_mc_list *pmc, *pmc_prev;
+ struct ip_sf_list *psf, *psf_next;
+
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+ pmc_prev = NULL;
+ for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) {
+ if (pmc->multiaddr == multiaddr)
+ break;
+ pmc_prev = pmc;
+ }
+ if (pmc) {
+ if (pmc_prev)
+ pmc_prev->next = pmc->next;
+ else
+ in_dev->mc_tomb = pmc->next;
+ }
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+ if (pmc) {
+ for (psf=pmc->tomb; psf; psf=psf_next) {
+ psf_next = psf->sf_next;
+ kfree(psf);
+ }
+ in_dev_put(pmc->interface);
+ kfree(pmc);
+ }
+}
+
+static void igmpv3_clear_delrec(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc, *nextpmc;
+
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+ pmc = in_dev->mc_tomb;
+ in_dev->mc_tomb = NULL;
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+ for (; pmc; pmc = nextpmc) {
+ nextpmc = pmc->next;
+ ip_mc_clear_src(pmc);
+ in_dev_put(pmc->interface);
+ kfree(pmc);
+ }
+ /* clear dead sources, too */
+ read_lock(&in_dev->mc_list_lock);
+ for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ struct ip_sf_list *psf, *psf_next;
+
+ spin_lock_bh(&pmc->lock);
+ psf = pmc->tomb;
+ pmc->tomb = NULL;
+ spin_unlock_bh(&pmc->lock);
+ for (; psf; psf=psf_next) {
+ psf_next = psf->sf_next;
+ kfree(psf);
+ }
+ }
+ read_unlock(&in_dev->mc_list_lock);
+}
+#endif
+
+static void igmp_group_dropped(struct ip_mc_list *im)
+{
+ struct in_device *in_dev = im->interface;
+#ifdef CONFIG_IP_MULTICAST
+ int reporter;
+#endif
+
+ if (im->loaded) {
+ im->loaded = 0;
+ ip_mc_filter_del(in_dev, im->multiaddr);
+ }
+
+#ifdef CONFIG_IP_MULTICAST
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ return;
+
+ reporter = im->reporter;
+ igmp_stop_timer(im);
+
+ if (!in_dev->dead) {
+ if (IGMP_V1_SEEN(in_dev))
+ goto done;
+ if (IGMP_V2_SEEN(in_dev)) {
+ if (reporter)
+ igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
+ goto done;
+ }
+ /* IGMPv3 */
+ igmpv3_add_delrec(in_dev, im);
+
+ igmp_ifc_event(in_dev);
+ }
+done:
+#endif
+ ip_mc_clear_src(im);
+}
+
+static void igmp_group_added(struct ip_mc_list *im)
+{
+ struct in_device *in_dev = im->interface;
+
+ if (im->loaded == 0) {
+ im->loaded = 1;
+ ip_mc_filter_add(in_dev, im->multiaddr);
+ }
+
+#ifdef CONFIG_IP_MULTICAST
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ return;
+
+ if (in_dev->dead)
+ return;
+ if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
+ spin_lock_bh(&im->lock);
+ igmp_start_timer(im, IGMP_Initial_Report_Delay);
+ spin_unlock_bh(&im->lock);
+ return;
+ }
+ /* else, v3 */
+
+ im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
+ IGMP_Unsolicited_Report_Count;
+ igmp_ifc_event(in_dev);
+#endif
+}
+
+
+/*
+ * Multicast list managers
+ */
+
+
+/*
+ * A socket has joined a multicast group on device dev.
+ */
+
+void ip_mc_inc_group(struct in_device *in_dev, u32 addr)
+{
+ struct ip_mc_list *im;
+
+ ASSERT_RTNL();
+
+ for (im=in_dev->mc_list; im; im=im->next) {
+ if (im->multiaddr == addr) {
+ im->users++;
+ ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
+ goto out;
+ }
+ }
+
+ im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL);
+ if (!im)
+ goto out;
+
+ im->users=1;
+ im->interface=in_dev;
+ in_dev_hold(in_dev);
+ im->multiaddr=addr;
+ /* initial mode is (EX, empty) */
+ im->sfmode = MCAST_EXCLUDE;
+ im->sfcount[MCAST_INCLUDE] = 0;
+ im->sfcount[MCAST_EXCLUDE] = 1;
+ im->sources = NULL;
+ im->tomb = NULL;
+ im->crcount = 0;
+ atomic_set(&im->refcnt, 1);
+ spin_lock_init(&im->lock);
+#ifdef CONFIG_IP_MULTICAST
+ im->tm_running=0;
+ init_timer(&im->timer);
+ im->timer.data=(unsigned long)im;
+ im->timer.function=&igmp_timer_expire;
+ im->unsolicit_count = IGMP_Unsolicited_Report_Count;
+ im->reporter = 0;
+ im->gsquery = 0;
+#endif
+ im->loaded = 0;
+ write_lock_bh(&in_dev->mc_list_lock);
+ im->next=in_dev->mc_list;
+ in_dev->mc_list=im;
+ write_unlock_bh(&in_dev->mc_list_lock);
+#ifdef CONFIG_IP_MULTICAST
+ igmpv3_del_delrec(in_dev, im->multiaddr);
+#endif
+ igmp_group_added(im);
+ if (!in_dev->dead)
+ ip_rt_multicast_event(in_dev);
+out:
+ return;
+}
+
+/*
+ * A socket has left a multicast group on device dev
+ */
+
+void ip_mc_dec_group(struct in_device *in_dev, u32 addr)
+{
+ struct ip_mc_list *i, **ip;
+
+ ASSERT_RTNL();
+
+ for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
+ if (i->multiaddr==addr) {
+ if (--i->users == 0) {
+ write_lock_bh(&in_dev->mc_list_lock);
+ *ip = i->next;
+ write_unlock_bh(&in_dev->mc_list_lock);
+ igmp_group_dropped(i);
+
+ if (!in_dev->dead)
+ ip_rt_multicast_event(in_dev);
+
+ ip_ma_put(i);
+ return;
+ }
+ break;
+ }
+ }
+}
+
+/* Device going down */
+
+void ip_mc_down(struct in_device *in_dev)
+{
+ struct ip_mc_list *i;
+
+ ASSERT_RTNL();
+
+ for (i=in_dev->mc_list; i; i=i->next)
+ igmp_group_dropped(i);
+
+#ifdef CONFIG_IP_MULTICAST
+ in_dev->mr_ifc_count = 0;
+ if (del_timer(&in_dev->mr_ifc_timer))
+ __in_dev_put(in_dev);
+ in_dev->mr_gq_running = 0;
+ if (del_timer(&in_dev->mr_gq_timer))
+ __in_dev_put(in_dev);
+ igmpv3_clear_delrec(in_dev);
+#endif
+
+ ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
+}
+
+void ip_mc_init_dev(struct in_device *in_dev)
+{
+ ASSERT_RTNL();
+
+ in_dev->mc_tomb = NULL;
+#ifdef CONFIG_IP_MULTICAST
+ in_dev->mr_gq_running = 0;
+ init_timer(&in_dev->mr_gq_timer);
+ in_dev->mr_gq_timer.data=(unsigned long) in_dev;
+ in_dev->mr_gq_timer.function=&igmp_gq_timer_expire;
+ in_dev->mr_ifc_count = 0;
+ init_timer(&in_dev->mr_ifc_timer);
+ in_dev->mr_ifc_timer.data=(unsigned long) in_dev;
+ in_dev->mr_ifc_timer.function=&igmp_ifc_timer_expire;
+ in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
+#endif
+
+ rwlock_init(&in_dev->mc_list_lock);
+ spin_lock_init(&in_dev->mc_tomb_lock);
+}
+
+/* Device going up */
+
+void ip_mc_up(struct in_device *in_dev)
+{
+ struct ip_mc_list *i;
+
+ ASSERT_RTNL();
+
+ ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
+
+ for (i=in_dev->mc_list; i; i=i->next)
+ igmp_group_added(i);
+}
+
+/*
+ * Device is about to be destroyed: clean up.
+ */
+
+void ip_mc_destroy_dev(struct in_device *in_dev)
+{
+ struct ip_mc_list *i;
+
+ ASSERT_RTNL();
+
+ /* Deactivate timers */
+ ip_mc_down(in_dev);
+
+ write_lock_bh(&in_dev->mc_list_lock);
+ while ((i = in_dev->mc_list) != NULL) {
+ in_dev->mc_list = i->next;
+ write_unlock_bh(&in_dev->mc_list_lock);
+
+ igmp_group_dropped(i);
+ ip_ma_put(i);
+
+ write_lock_bh(&in_dev->mc_list_lock);
+ }
+ write_unlock_bh(&in_dev->mc_list_lock);
+}
+
+static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
+{
+ struct flowi fl = { .nl_u = { .ip4_u =
+ { .daddr = imr->imr_multiaddr.s_addr } } };
+ struct rtable *rt;
+ struct net_device *dev = NULL;
+ struct in_device *idev = NULL;
+
+ if (imr->imr_ifindex) {
+ idev = inetdev_by_index(imr->imr_ifindex);
+ if (idev)
+ __in_dev_put(idev);
+ return idev;
+ }
+ if (imr->imr_address.s_addr) {
+ dev = ip_dev_find(imr->imr_address.s_addr);
+ if (!dev)
+ return NULL;
+ __dev_put(dev);
+ }
+
+ if (!dev && !ip_route_output_key(&rt, &fl)) {
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ }
+ if (dev) {
+ imr->imr_ifindex = dev->ifindex;
+ idev = __in_dev_get(dev);
+ }
+ return idev;
+}
+
+/*
+ * Join a socket to a group
+ */
+int sysctl_igmp_max_memberships = IP_MAX_MEMBERSHIPS;
+int sysctl_igmp_max_msf = IP_MAX_MSF;
+
+
+static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
+ __u32 *psfsrc)
+{
+ struct ip_sf_list *psf, *psf_prev;
+ int rv = 0;
+
+ psf_prev = NULL;
+ for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ if (psf->sf_inaddr == *psfsrc)
+ break;
+ psf_prev = psf;
+ }
+ if (!psf || psf->sf_count[sfmode] == 0) {
+ /* source filter not found, or count wrong => bug */
+ return -ESRCH;
+ }
+ psf->sf_count[sfmode]--;
+ if (psf->sf_count[sfmode] == 0) {
+ ip_rt_multicast_event(pmc->interface);
+ }
+ if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
+#ifdef CONFIG_IP_MULTICAST
+ struct in_device *in_dev = pmc->interface;
+#endif
+
+ /* no more filters for this source */
+ if (psf_prev)
+ psf_prev->sf_next = psf->sf_next;
+ else
+ pmc->sources = psf->sf_next;
+#ifdef CONFIG_IP_MULTICAST
+ if (psf->sf_oldin &&
+ !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
+ psf->sf_crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
+ IGMP_Unsolicited_Report_Count;
+ psf->sf_next = pmc->tomb;
+ pmc->tomb = psf;
+ rv = 1;
+ } else
+#endif
+ kfree(psf);
+ }
+ return rv;
+}
+
+#ifndef CONFIG_IP_MULTICAST
+#define igmp_ifc_event(x) do { } while (0)
+#endif
+
+static int ip_mc_del_src(struct in_device *in_dev, __u32 *pmca, int sfmode,
+ int sfcount, __u32 *psfsrc, int delta)
+{
+ struct ip_mc_list *pmc;
+ int changerec = 0;
+ int i, err;
+
+ if (!in_dev)
+ return -ENODEV;
+ read_lock(&in_dev->mc_list_lock);
+ for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ if (*pmca == pmc->multiaddr)
+ break;
+ }
+ if (!pmc) {
+ /* MCA not found?? bug */
+ read_unlock(&in_dev->mc_list_lock);
+ return -ESRCH;
+ }
+ spin_lock_bh(&pmc->lock);
+ read_unlock(&in_dev->mc_list_lock);
+#ifdef CONFIG_IP_MULTICAST
+ sf_markstate(pmc);
+#endif
+ if (!delta) {
+ err = -EINVAL;
+ if (!pmc->sfcount[sfmode])
+ goto out_unlock;
+ pmc->sfcount[sfmode]--;
+ }
+ err = 0;
+ for (i=0; i<sfcount; i++) {
+ int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+
+ changerec |= rv > 0;
+ if (!err && rv < 0)
+ err = rv;
+ }
+ if (pmc->sfmode == MCAST_EXCLUDE &&
+ pmc->sfcount[MCAST_EXCLUDE] == 0 &&
+ pmc->sfcount[MCAST_INCLUDE]) {
+#ifdef CONFIG_IP_MULTICAST
+ struct ip_sf_list *psf;
+#endif
+
+ /* filter mode change */
+ pmc->sfmode = MCAST_INCLUDE;
+#ifdef CONFIG_IP_MULTICAST
+ pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
+ IGMP_Unsolicited_Report_Count;
+ in_dev->mr_ifc_count = pmc->crcount;
+ for (psf=pmc->sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = 0;
+ igmp_ifc_event(pmc->interface);
+ } else if (sf_setstate(pmc) || changerec) {
+ igmp_ifc_event(pmc->interface);
+#endif
+ }
+out_unlock:
+ spin_unlock_bh(&pmc->lock);
+ return err;
+}
+
+/*
+ * Add multicast single-source filter to the interface list
+ */
+static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
+ __u32 *psfsrc, int delta)
+{
+ struct ip_sf_list *psf, *psf_prev;
+
+ psf_prev = NULL;
+ for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ if (psf->sf_inaddr == *psfsrc)
+ break;
+ psf_prev = psf;
+ }
+ if (!psf) {
+ psf = (struct ip_sf_list *)kmalloc(sizeof(*psf), GFP_ATOMIC);
+ if (!psf)
+ return -ENOBUFS;
+ memset(psf, 0, sizeof(*psf));
+ psf->sf_inaddr = *psfsrc;
+ if (psf_prev) {
+ psf_prev->sf_next = psf;
+ } else
+ pmc->sources = psf;
+ }
+ psf->sf_count[sfmode]++;
+ if (psf->sf_count[sfmode] == 1) {
+ ip_rt_multicast_event(pmc->interface);
+ }
+ return 0;
+}
+
+#ifdef CONFIG_IP_MULTICAST
+static void sf_markstate(struct ip_mc_list *pmc)
+{
+ struct ip_sf_list *psf;
+ int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
+
+ for (psf=pmc->sources; psf; psf=psf->sf_next)
+ if (pmc->sfcount[MCAST_EXCLUDE]) {
+ psf->sf_oldin = mca_xcount ==
+ psf->sf_count[MCAST_EXCLUDE] &&
+ !psf->sf_count[MCAST_INCLUDE];
+ } else
+ psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
+}
+
+static int sf_setstate(struct ip_mc_list *pmc)
+{
+ struct ip_sf_list *psf;
+ int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
+ int qrv = pmc->interface->mr_qrv;
+ int new_in, rv;
+
+ rv = 0;
+ for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ if (pmc->sfcount[MCAST_EXCLUDE]) {
+ new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
+ !psf->sf_count[MCAST_INCLUDE];
+ } else
+ new_in = psf->sf_count[MCAST_INCLUDE] != 0;
+ if (new_in != psf->sf_oldin) {
+ psf->sf_crcount = qrv;
+ rv++;
+ }
+ }
+ return rv;
+}
+#endif
+
+/*
+ * Add multicast source filter list to the interface list
+ */
+static int ip_mc_add_src(struct in_device *in_dev, __u32 *pmca, int sfmode,
+ int sfcount, __u32 *psfsrc, int delta)
+{
+ struct ip_mc_list *pmc;
+ int isexclude;
+ int i, err;
+
+ if (!in_dev)
+ return -ENODEV;
+ read_lock(&in_dev->mc_list_lock);
+ for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ if (*pmca == pmc->multiaddr)
+ break;
+ }
+ if (!pmc) {
+ /* MCA not found?? bug */
+ read_unlock(&in_dev->mc_list_lock);
+ return -ESRCH;
+ }
+ spin_lock_bh(&pmc->lock);
+ read_unlock(&in_dev->mc_list_lock);
+
+#ifdef CONFIG_IP_MULTICAST
+ sf_markstate(pmc);
+#endif
+ isexclude = pmc->sfmode == MCAST_EXCLUDE;
+ if (!delta)
+ pmc->sfcount[sfmode]++;
+ err = 0;
+ for (i=0; i<sfcount; i++) {
+ err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta);
+ if (err)
+ break;
+ }
+ if (err) {
+ int j;
+
+ pmc->sfcount[sfmode]--;
+ for (j=0; j<i; j++)
+ (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+ } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
+#ifdef CONFIG_IP_MULTICAST
+ struct in_device *in_dev = pmc->interface;
+ struct ip_sf_list *psf;
+#endif
+
+ /* filter mode change */
+ if (pmc->sfcount[MCAST_EXCLUDE])
+ pmc->sfmode = MCAST_EXCLUDE;
+ else if (pmc->sfcount[MCAST_INCLUDE])
+ pmc->sfmode = MCAST_INCLUDE;
+#ifdef CONFIG_IP_MULTICAST
+ /* else no filters; keep old mode for reports */
+
+ pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
+ IGMP_Unsolicited_Report_Count;
+ in_dev->mr_ifc_count = pmc->crcount;
+ for (psf=pmc->sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = 0;
+ igmp_ifc_event(in_dev);
+ } else if (sf_setstate(pmc)) {
+ igmp_ifc_event(in_dev);
+#endif
+ }
+ spin_unlock_bh(&pmc->lock);
+ return err;
+}
+
+static void ip_mc_clear_src(struct ip_mc_list *pmc)
+{
+ struct ip_sf_list *psf, *nextpsf;
+
+ for (psf=pmc->tomb; psf; psf=nextpsf) {
+ nextpsf = psf->sf_next;
+ kfree(psf);
+ }
+ pmc->tomb = NULL;
+ for (psf=pmc->sources; psf; psf=nextpsf) {
+ nextpsf = psf->sf_next;
+ kfree(psf);
+ }
+ pmc->sources = NULL;
+ pmc->sfmode = MCAST_EXCLUDE;
+ pmc->sfcount[MCAST_EXCLUDE] = 0;
+ pmc->sfcount[MCAST_EXCLUDE] = 1;
+}
+
+
+/*
+ * Join a multicast group
+ */
+int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
+{
+ int err;
+ u32 addr = imr->imr_multiaddr.s_addr;
+ struct ip_mc_socklist *iml, *i;
+ struct in_device *in_dev;
+ struct inet_sock *inet = inet_sk(sk);
+ int count = 0;
+
+ if (!MULTICAST(addr))
+ return -EINVAL;
+
+ rtnl_shlock();
+
+ in_dev = ip_mc_find_dev(imr);
+
+ if (!in_dev) {
+ iml = NULL;
+ err = -ENODEV;
+ goto done;
+ }
+
+ iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
+
+ err = -EADDRINUSE;
+ for (i = inet->mc_list; i; i = i->next) {
+ if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) {
+ /* New style additions are reference counted */
+ if (imr->imr_address.s_addr == 0) {
+ i->count++;
+ err = 0;
+ }
+ goto done;
+ }
+ count++;
+ }
+ err = -ENOBUFS;
+ if (iml == NULL || count >= sysctl_igmp_max_memberships)
+ goto done;
+ memcpy(&iml->multi, imr, sizeof(*imr));
+ iml->next = inet->mc_list;
+ iml->count = 1;
+ iml->sflist = NULL;
+ iml->sfmode = MCAST_EXCLUDE;
+ inet->mc_list = iml;
+ ip_mc_inc_group(in_dev, addr);
+ iml = NULL;
+ err = 0;
+
+done:
+ rtnl_shunlock();
+ if (iml)
+ sock_kfree_s(sk, iml, sizeof(*iml));
+ return err;
+}
+
+static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
+ struct in_device *in_dev)
+{
+ int err;
+
+ if (iml->sflist == 0) {
+ /* any-source empty exclude case */
+ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
+ iml->sfmode, 0, NULL, 0);
+ }
+ err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
+ iml->sfmode, iml->sflist->sl_count,
+ iml->sflist->sl_addr, 0);
+ sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max));
+ iml->sflist = NULL;
+ return err;
+}
+
+/*
+ * Ask a socket to leave a group.
+ */
+
+int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_mc_socklist *iml, **imlp;
+
+ rtnl_lock();
+ for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
+ if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr &&
+ iml->multi.imr_address.s_addr==imr->imr_address.s_addr &&
+ (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) {
+ struct in_device *in_dev;
+
+ in_dev = inetdev_by_index(iml->multi.imr_ifindex);
+ if (in_dev)
+ (void) ip_mc_leave_src(sk, iml, in_dev);
+ if (--iml->count) {
+ rtnl_unlock();
+ if (in_dev)
+ in_dev_put(in_dev);
+ return 0;
+ }
+
+ *imlp = iml->next;
+
+ if (in_dev) {
+ ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
+ in_dev_put(in_dev);
+ }
+ rtnl_unlock();
+ sock_kfree_s(sk, iml, sizeof(*iml));
+ return 0;
+ }
+ }
+ rtnl_unlock();
+ return -EADDRNOTAVAIL;
+}
+
+int ip_mc_source(int add, int omode, struct sock *sk, struct
+ ip_mreq_source *mreqs, int ifindex)
+{
+ int err;
+ struct ip_mreqn imr;
+ u32 addr = mreqs->imr_multiaddr;
+ struct ip_mc_socklist *pmc;
+ struct in_device *in_dev = NULL;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *psl;
+ int i, j, rv;
+
+ if (!MULTICAST(addr))
+ return -EINVAL;
+
+ rtnl_shlock();
+
+ imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
+ imr.imr_address.s_addr = mreqs->imr_interface;
+ imr.imr_ifindex = ifindex;
+ in_dev = ip_mc_find_dev(&imr);
+
+ if (!in_dev) {
+ err = -ENODEV;
+ goto done;
+ }
+ err = -EADDRNOTAVAIL;
+
+ for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0)
+ break;
+ }
+ if (!pmc) /* must have a prior join */
+ goto done;
+ /* if a source filter was set, must be the same mode as before */
+ if (pmc->sflist) {
+ if (pmc->sfmode != omode)
+ goto done;
+ } else if (pmc->sfmode != omode) {
+ /* allow mode switches for empty-set filters */
+ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
+ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0,
+ NULL, 0);
+ pmc->sfmode = omode;
+ }
+
+ psl = pmc->sflist;
+ if (!add) {
+ if (!psl)
+ goto done;
+ rv = !0;
+ for (i=0; i<psl->sl_count; i++) {
+ rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
+ sizeof(__u32));
+ if (rv == 0)
+ break;
+ }
+ if (rv) /* source not found */
+ goto done;
+
+ /* update the interface filter */
+ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
+ &mreqs->imr_sourceaddr, 1);
+
+ for (j=i+1; j<psl->sl_count; j++)
+ psl->sl_addr[j-1] = psl->sl_addr[j];
+ psl->sl_count--;
+ err = 0;
+ goto done;
+ }
+ /* else, add a new source to the filter */
+
+ if (psl && psl->sl_count >= sysctl_igmp_max_msf) {
+ err = -ENOBUFS;
+ goto done;
+ }
+ if (!psl || psl->sl_count == psl->sl_max) {
+ struct ip_sf_socklist *newpsl;
+ int count = IP_SFBLOCK;
+
+ if (psl)
+ count += psl->sl_max;
+ newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
+ IP_SFLSIZE(count), GFP_KERNEL);
+ if (!newpsl) {
+ err = -ENOBUFS;
+ goto done;
+ }
+ newpsl->sl_max = count;
+ newpsl->sl_count = count - IP_SFBLOCK;
+ if (psl) {
+ for (i=0; i<psl->sl_count; i++)
+ newpsl->sl_addr[i] = psl->sl_addr[i];
+ sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
+ }
+ pmc->sflist = psl = newpsl;
+ }
+ rv = 1; /* > 0 for insert logic below if sl_count is 0 */
+ for (i=0; i<psl->sl_count; i++) {
+ rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
+ sizeof(__u32));
+ if (rv == 0)
+ break;
+ }
+ if (rv == 0) /* address already there is an error */
+ goto done;
+ for (j=psl->sl_count-1; j>=i; j--)
+ psl->sl_addr[j+1] = psl->sl_addr[j];
+ psl->sl_addr[i] = mreqs->imr_sourceaddr;
+ psl->sl_count++;
+ err = 0;
+ /* update the interface list */
+ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
+ &mreqs->imr_sourceaddr, 1);
+done:
+ rtnl_shunlock();
+ return err;
+}
+
+int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
+{
+ int err;
+ struct ip_mreqn imr;
+ u32 addr = msf->imsf_multiaddr;
+ struct ip_mc_socklist *pmc;
+ struct in_device *in_dev;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *newpsl, *psl;
+
+ if (!MULTICAST(addr))
+ return -EINVAL;
+ if (msf->imsf_fmode != MCAST_INCLUDE &&
+ msf->imsf_fmode != MCAST_EXCLUDE)
+ return -EINVAL;
+
+ rtnl_shlock();
+
+ imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
+ imr.imr_address.s_addr = msf->imsf_interface;
+ imr.imr_ifindex = ifindex;
+ in_dev = ip_mc_find_dev(&imr);
+
+ if (!in_dev) {
+ err = -ENODEV;
+ goto done;
+ }
+ err = -EADDRNOTAVAIL;
+
+ for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
+ pmc->multi.imr_ifindex == imr.imr_ifindex)
+ break;
+ }
+ if (!pmc) /* must have a prior join */
+ goto done;
+ if (msf->imsf_numsrc) {
+ newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
+ IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL);
+ if (!newpsl) {
+ err = -ENOBUFS;
+ goto done;
+ }
+ newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
+ memcpy(newpsl->sl_addr, msf->imsf_slist,
+ msf->imsf_numsrc * sizeof(msf->imsf_slist[0]));
+ err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
+ msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
+ if (err) {
+ sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
+ goto done;
+ }
+ } else
+ newpsl = NULL;
+ psl = pmc->sflist;
+ if (psl) {
+ (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
+ psl->sl_count, psl->sl_addr, 0);
+ sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
+ } else
+ (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
+ 0, NULL, 0);
+ pmc->sflist = newpsl;
+ pmc->sfmode = msf->imsf_fmode;
+done:
+ rtnl_shunlock();
+ return err;
+}
+
+int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
+ struct ip_msfilter __user *optval, int __user *optlen)
+{
+ int err, len, count, copycount;
+ struct ip_mreqn imr;
+ u32 addr = msf->imsf_multiaddr;
+ struct ip_mc_socklist *pmc;
+ struct in_device *in_dev;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *psl;
+
+ if (!MULTICAST(addr))
+ return -EINVAL;
+
+ rtnl_shlock();
+
+ imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
+ imr.imr_address.s_addr = msf->imsf_interface;
+ imr.imr_ifindex = 0;
+ in_dev = ip_mc_find_dev(&imr);
+
+ if (!in_dev) {
+ err = -ENODEV;
+ goto done;
+ }
+ err = -EADDRNOTAVAIL;
+
+ for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
+ pmc->multi.imr_ifindex == imr.imr_ifindex)
+ break;
+ }
+ if (!pmc) /* must have a prior join */
+ goto done;
+ msf->imsf_fmode = pmc->sfmode;
+ psl = pmc->sflist;
+ rtnl_shunlock();
+ if (!psl) {
+ len = 0;
+ count = 0;
+ } else {
+ count = psl->sl_count;
+ }
+ copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
+ len = copycount * sizeof(psl->sl_addr[0]);
+ msf->imsf_numsrc = count;
+ if (put_user(IP_MSFILTER_SIZE(copycount), optlen) ||
+ copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) {
+ return -EFAULT;
+ }
+ if (len &&
+ copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len))
+ return -EFAULT;
+ return 0;
+done:
+ rtnl_shunlock();
+ return err;
+}
+
+int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
+ struct group_filter __user *optval, int __user *optlen)
+{
+ int err, i, count, copycount;
+ struct sockaddr_in *psin;
+ u32 addr;
+ struct ip_mc_socklist *pmc;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *psl;
+
+ psin = (struct sockaddr_in *)&gsf->gf_group;
+ if (psin->sin_family != AF_INET)
+ return -EINVAL;
+ addr = psin->sin_addr.s_addr;
+ if (!MULTICAST(addr))
+ return -EINVAL;
+
+ rtnl_shlock();
+
+ err = -EADDRNOTAVAIL;
+
+ for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ if (pmc->multi.imr_multiaddr.s_addr == addr &&
+ pmc->multi.imr_ifindex == gsf->gf_interface)
+ break;
+ }
+ if (!pmc) /* must have a prior join */
+ goto done;
+ gsf->gf_fmode = pmc->sfmode;
+ psl = pmc->sflist;
+ rtnl_shunlock();
+ count = psl ? psl->sl_count : 0;
+ copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
+ gsf->gf_numsrc = count;
+ if (put_user(GROUP_FILTER_SIZE(copycount), optlen) ||
+ copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
+ return -EFAULT;
+ }
+ for (i=0; i<copycount; i++) {
+ struct sockaddr_in *psin;
+ struct sockaddr_storage ss;
+
+ psin = (struct sockaddr_in *)&ss;
+ memset(&ss, 0, sizeof(ss));
+ psin->sin_family = AF_INET;
+ psin->sin_addr.s_addr = psl->sl_addr[i];
+ if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
+ return -EFAULT;
+ }
+ return 0;
+done:
+ rtnl_shunlock();
+ return err;
+}
+
+/*
+ * check if a multicast source filter allows delivery for a given <src,dst,intf>
+ */
+int ip_mc_sf_allow(struct sock *sk, u32 loc_addr, u32 rmt_addr, int dif)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_mc_socklist *pmc;
+ struct ip_sf_socklist *psl;
+ int i;
+
+ if (!MULTICAST(loc_addr))
+ return 1;
+
+ for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
+ pmc->multi.imr_ifindex == dif)
+ break;
+ }
+ if (!pmc)
+ return 1;
+ psl = pmc->sflist;
+ if (!psl)
+ return pmc->sfmode == MCAST_EXCLUDE;
+
+ for (i=0; i<psl->sl_count; i++) {
+ if (psl->sl_addr[i] == rmt_addr)
+ break;
+ }
+ if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
+ return 0;
+ if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
+ return 0;
+ return 1;
+}
+
+/*
+ * A socket is closing.
+ */
+
+void ip_mc_drop_socket(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_mc_socklist *iml;
+
+ if (inet->mc_list == NULL)
+ return;
+
+ rtnl_lock();
+ while ((iml = inet->mc_list) != NULL) {
+ struct in_device *in_dev;
+ inet->mc_list = iml->next;
+
+ if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) {
+ (void) ip_mc_leave_src(sk, iml, in_dev);
+ ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
+ in_dev_put(in_dev);
+ }
+ sock_kfree_s(sk, iml, sizeof(*iml));
+
+ }
+ rtnl_unlock();
+}
+
+int ip_check_mc(struct in_device *in_dev, u32 mc_addr, u32 src_addr, u16 proto)
+{
+ struct ip_mc_list *im;
+ struct ip_sf_list *psf;
+ int rv = 0;
+
+ read_lock(&in_dev->mc_list_lock);
+ for (im=in_dev->mc_list; im; im=im->next) {
+ if (im->multiaddr == mc_addr)
+ break;
+ }
+ if (im && proto == IPPROTO_IGMP) {
+ rv = 1;
+ } else if (im) {
+ if (src_addr) {
+ for (psf=im->sources; psf; psf=psf->sf_next) {
+ if (psf->sf_inaddr == src_addr)
+ break;
+ }
+ if (psf)
+ rv = psf->sf_count[MCAST_INCLUDE] ||
+ psf->sf_count[MCAST_EXCLUDE] !=
+ im->sfcount[MCAST_EXCLUDE];
+ else
+ rv = im->sfcount[MCAST_EXCLUDE] != 0;
+ } else
+ rv = 1; /* unspecified source; tentatively allow */
+ }
+ read_unlock(&in_dev->mc_list_lock);
+ return rv;
+}
+
+#if defined(CONFIG_PROC_FS)
+struct igmp_mc_iter_state {
+ struct net_device *dev;
+ struct in_device *in_dev;
+};
+
+#define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private)
+
+static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
+{
+ struct ip_mc_list *im = NULL;
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+
+ for (state->dev = dev_base, state->in_dev = NULL;
+ state->dev;
+ state->dev = state->dev->next) {
+ struct in_device *in_dev;
+ in_dev = in_dev_get(state->dev);
+ if (!in_dev)
+ continue;
+ read_lock(&in_dev->mc_list_lock);
+ im = in_dev->mc_list;
+ if (im) {
+ state->in_dev = in_dev;
+ break;
+ }
+ read_unlock(&in_dev->mc_list_lock);
+ in_dev_put(in_dev);
+ }
+ return im;
+}
+
+static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
+{
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+ im = im->next;
+ while (!im) {
+ if (likely(state->in_dev != NULL)) {
+ read_unlock(&state->in_dev->mc_list_lock);
+ in_dev_put(state->in_dev);
+ }
+ state->dev = state->dev->next;
+ if (!state->dev) {
+ state->in_dev = NULL;
+ break;
+ }
+ state->in_dev = in_dev_get(state->dev);
+ if (!state->in_dev)
+ continue;
+ read_lock(&state->in_dev->mc_list_lock);
+ im = state->in_dev->mc_list;
+ }
+ return im;
+}
+
+static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct ip_mc_list *im = igmp_mc_get_first(seq);
+ if (im)
+ while (pos && (im = igmp_mc_get_next(seq, im)) != NULL)
+ --pos;
+ return pos ? NULL : im;
+}
+
+static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ read_lock(&dev_base_lock);
+ return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip_mc_list *im;
+ if (v == SEQ_START_TOKEN)
+ im = igmp_mc_get_first(seq);
+ else
+ im = igmp_mc_get_next(seq, v);
+ ++*pos;
+ return im;
+}
+
+static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
+{
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+ if (likely(state->in_dev != NULL)) {
+ read_unlock(&state->in_dev->mc_list_lock);
+ in_dev_put(state->in_dev);
+ state->in_dev = NULL;
+ }
+ state->dev = NULL;
+ read_unlock(&dev_base_lock);
+}
+
+static int igmp_mc_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n");
+ else {
+ struct ip_mc_list *im = (struct ip_mc_list *)v;
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+ char *querier;
+#ifdef CONFIG_IP_MULTICAST
+ querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
+ IGMP_V2_SEEN(state->in_dev) ? "V2" :
+ "V3";
+#else
+ querier = "NONE";
+#endif
+
+ if (state->in_dev->mc_list == im) {
+ seq_printf(seq, "%d\t%-10s: %5d %7s\n",
+ state->dev->ifindex, state->dev->name, state->dev->mc_count, querier);
+ }
+
+ seq_printf(seq,
+ "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n",
+ im->multiaddr, im->users,
+ im->tm_running, im->tm_running ?
+ jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
+ im->reporter);
+ }
+ return 0;
+}
+
+static struct seq_operations igmp_mc_seq_ops = {
+ .start = igmp_mc_seq_start,
+ .next = igmp_mc_seq_next,
+ .stop = igmp_mc_seq_stop,
+ .show = igmp_mc_seq_show,
+};
+
+static int igmp_mc_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct igmp_mc_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+ rc = seq_open(file, &igmp_mc_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+ seq->private = s;
+ memset(s, 0, sizeof(*s));
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+static struct file_operations igmp_mc_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = igmp_mc_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+struct igmp_mcf_iter_state {
+ struct net_device *dev;
+ struct in_device *idev;
+ struct ip_mc_list *im;
+};
+
+#define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private)
+
+static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
+{
+ struct ip_sf_list *psf = NULL;
+ struct ip_mc_list *im = NULL;
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+ for (state->dev = dev_base, state->idev = NULL, state->im = NULL;
+ state->dev;
+ state->dev = state->dev->next) {
+ struct in_device *idev;
+ idev = in_dev_get(state->dev);
+ if (unlikely(idev == NULL))
+ continue;
+ read_lock(&idev->mc_list_lock);
+ im = idev->mc_list;
+ if (likely(im != NULL)) {
+ spin_lock_bh(&im->lock);
+ psf = im->sources;
+ if (likely(psf != NULL)) {
+ state->im = im;
+ state->idev = idev;
+ break;
+ }
+ spin_unlock_bh(&im->lock);
+ }
+ read_unlock(&idev->mc_list_lock);
+ in_dev_put(idev);
+ }
+ return psf;
+}
+
+static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf)
+{
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+ psf = psf->sf_next;
+ while (!psf) {
+ spin_unlock_bh(&state->im->lock);
+ state->im = state->im->next;
+ while (!state->im) {
+ if (likely(state->idev != NULL)) {
+ read_unlock(&state->idev->mc_list_lock);
+ in_dev_put(state->idev);
+ }
+ state->dev = state->dev->next;
+ if (!state->dev) {
+ state->idev = NULL;
+ goto out;
+ }
+ state->idev = in_dev_get(state->dev);
+ if (!state->idev)
+ continue;
+ read_lock(&state->idev->mc_list_lock);
+ state->im = state->idev->mc_list;
+ }
+ if (!state->im)
+ break;
+ spin_lock_bh(&state->im->lock);
+ psf = state->im->sources;
+ }
+out:
+ return psf;
+}
+
+static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct ip_sf_list *psf = igmp_mcf_get_first(seq);
+ if (psf)
+ while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL)
+ --pos;
+ return pos ? NULL : psf;
+}
+
+static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ read_lock(&dev_base_lock);
+ return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip_sf_list *psf;
+ if (v == SEQ_START_TOKEN)
+ psf = igmp_mcf_get_first(seq);
+ else
+ psf = igmp_mcf_get_next(seq, v);
+ ++*pos;
+ return psf;
+}
+
+static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
+{
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+ if (likely(state->im != NULL)) {
+ spin_unlock_bh(&state->im->lock);
+ state->im = NULL;
+ }
+ if (likely(state->idev != NULL)) {
+ read_unlock(&state->idev->mc_list_lock);
+ in_dev_put(state->idev);
+ state->idev = NULL;
+ }
+ state->dev = NULL;
+ read_unlock(&dev_base_lock);
+}
+
+static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
+{
+ struct ip_sf_list *psf = (struct ip_sf_list *)v;
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq,
+ "%3s %6s "
+ "%10s %10s %6s %6s\n", "Idx",
+ "Device", "MCA",
+ "SRC", "INC", "EXC");
+ } else {
+ seq_printf(seq,
+ "%3d %6.6s 0x%08x "
+ "0x%08x %6lu %6lu\n",
+ state->dev->ifindex, state->dev->name,
+ ntohl(state->im->multiaddr),
+ ntohl(psf->sf_inaddr),
+ psf->sf_count[MCAST_INCLUDE],
+ psf->sf_count[MCAST_EXCLUDE]);
+ }
+ return 0;
+}
+
+static struct seq_operations igmp_mcf_seq_ops = {
+ .start = igmp_mcf_seq_start,
+ .next = igmp_mcf_seq_next,
+ .stop = igmp_mcf_seq_stop,
+ .show = igmp_mcf_seq_show,
+};
+
+static int igmp_mcf_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+ struct igmp_mcf_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ goto out;
+ rc = seq_open(file, &igmp_mcf_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+ seq->private = s;
+ memset(s, 0, sizeof(*s));
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+static struct file_operations igmp_mcf_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = igmp_mcf_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+int __init igmp_mc_proc_init(void)
+{
+ proc_net_fops_create("igmp", S_IRUGO, &igmp_mc_seq_fops);
+ proc_net_fops_create("mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
+ return 0;
+}
+#endif
+
+EXPORT_SYMBOL(ip_mc_dec_group);
+EXPORT_SYMBOL(ip_mc_inc_group);
+EXPORT_SYMBOL(ip_mc_join_group);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
new file mode 100644
index 000000000000..95473953c406
--- /dev/null
+++ b/net/ipv4/inetpeer.c
@@ -0,0 +1,460 @@
+/*
+ * INETPEER - A storage for permanent information about peers
+ *
+ * This source is covered by the GNU GPL, the same as all kernel sources.
+ *
+ * Version: $Id: inetpeer.c,v 1.7 2001/09/20 21:22:50 davem Exp $
+ *
+ * Authors: Andrey V. Savochkin <saw@msu.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/net.h>
+#include <net/inetpeer.h>
+
+/*
+ * Theory of operations.
+ * We keep one entry for each peer IP address. The nodes contains long-living
+ * information about the peer which doesn't depend on routes.
+ * At this moment this information consists only of ID field for the next
+ * outgoing IP packet. This field is incremented with each packet as encoded
+ * in inet_getid() function (include/net/inetpeer.h).
+ * At the moment of writing this notes identifier of IP packets is generated
+ * to be unpredictable using this code only for packets subjected
+ * (actually or potentially) to defragmentation. I.e. DF packets less than
+ * PMTU in size uses a constant ID and do not use this code (see
+ * ip_select_ident() in include/net/ip.h).
+ *
+ * Route cache entries hold references to our nodes.
+ * New cache entries get references via lookup by destination IP address in
+ * the avl tree. The reference is grabbed only when it's needed i.e. only
+ * when we try to output IP packet which needs an unpredictable ID (see
+ * __ip_select_ident() in net/ipv4/route.c).
+ * Nodes are removed only when reference counter goes to 0.
+ * When it's happened the node may be removed when a sufficient amount of
+ * time has been passed since its last use. The less-recently-used entry can
+ * also be removed if the pool is overloaded i.e. if the total amount of
+ * entries is greater-or-equal than the threshold.
+ *
+ * Node pool is organised as an AVL tree.
+ * Such an implementation has been chosen not just for fun. It's a way to
+ * prevent easy and efficient DoS attacks by creating hash collisions. A huge
+ * amount of long living nodes in a single hash slot would significantly delay
+ * lookups performed with disabled BHs.
+ *
+ * Serialisation issues.
+ * 1. Nodes may appear in the tree only with the pool write lock held.
+ * 2. Nodes may disappear from the tree only with the pool write lock held
+ * AND reference count being 0.
+ * 3. Nodes appears and disappears from unused node list only under
+ * "inet_peer_unused_lock".
+ * 4. Global variable peer_total is modified under the pool lock.
+ * 5. struct inet_peer fields modification:
+ * avl_left, avl_right, avl_parent, avl_height: pool lock
+ * unused_next, unused_prevp: unused node list lock
+ * refcnt: atomically against modifications on other CPU;
+ * usually under some other lock to prevent node disappearing
+ * dtime: unused node list lock
+ * v4daddr: unchangeable
+ * ip_id_count: idlock
+ */
+
+/* Exported for inet_getid inline function. */
+DEFINE_SPINLOCK(inet_peer_idlock);
+
+static kmem_cache_t *peer_cachep;
+
+#define node_height(x) x->avl_height
+static struct inet_peer peer_fake_node = {
+ .avl_left = &peer_fake_node,
+ .avl_right = &peer_fake_node,
+ .avl_height = 0
+};
+#define peer_avl_empty (&peer_fake_node)
+static struct inet_peer *peer_root = peer_avl_empty;
+static DEFINE_RWLOCK(peer_pool_lock);
+#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
+
+static volatile int peer_total;
+/* Exported for sysctl_net_ipv4. */
+int inet_peer_threshold = 65536 + 128; /* start to throw entries more
+ * aggressively at this stage */
+int inet_peer_minttl = 120 * HZ; /* TTL under high load: 120 sec */
+int inet_peer_maxttl = 10 * 60 * HZ; /* usual time to live: 10 min */
+
+static struct inet_peer *inet_peer_unused_head;
+/* Exported for inet_putpeer inline function. */
+struct inet_peer **inet_peer_unused_tailp = &inet_peer_unused_head;
+DEFINE_SPINLOCK(inet_peer_unused_lock);
+#define PEER_MAX_CLEANUP_WORK 30
+
+static void peer_check_expire(unsigned long dummy);
+static struct timer_list peer_periodic_timer =
+ TIMER_INITIALIZER(peer_check_expire, 0, 0);
+
+/* Exported for sysctl_net_ipv4. */
+int inet_peer_gc_mintime = 10 * HZ,
+ inet_peer_gc_maxtime = 120 * HZ;
+
+/* Called from ip_output.c:ip_init */
+void __init inet_initpeers(void)
+{
+ struct sysinfo si;
+
+ /* Use the straight interface to information about memory. */
+ si_meminfo(&si);
+ /* The values below were suggested by Alexey Kuznetsov
+ * <kuznet@ms2.inr.ac.ru>. I don't have any opinion about the values
+ * myself. --SAW
+ */
+ if (si.totalram <= (32768*1024)/PAGE_SIZE)
+ inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
+ if (si.totalram <= (16384*1024)/PAGE_SIZE)
+ inet_peer_threshold >>= 1; /* about 512KB */
+ if (si.totalram <= (8192*1024)/PAGE_SIZE)
+ inet_peer_threshold >>= 2; /* about 128KB */
+
+ peer_cachep = kmem_cache_create("inet_peer_cache",
+ sizeof(struct inet_peer),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+
+ if (!peer_cachep)
+ panic("cannot create inet_peer_cache");
+
+ /* All the timers, started at system startup tend
+ to synchronize. Perturb it a bit.
+ */
+ peer_periodic_timer.expires = jiffies
+ + net_random() % inet_peer_gc_maxtime
+ + inet_peer_gc_maxtime;
+ add_timer(&peer_periodic_timer);
+}
+
+/* Called with or without local BH being disabled. */
+static void unlink_from_unused(struct inet_peer *p)
+{
+ spin_lock_bh(&inet_peer_unused_lock);
+ if (p->unused_prevp != NULL) {
+ /* On unused list. */
+ *p->unused_prevp = p->unused_next;
+ if (p->unused_next != NULL)
+ p->unused_next->unused_prevp = p->unused_prevp;
+ else
+ inet_peer_unused_tailp = p->unused_prevp;
+ p->unused_prevp = NULL; /* mark it as removed */
+ }
+ spin_unlock_bh(&inet_peer_unused_lock);
+}
+
+/* Called with local BH disabled and the pool lock held. */
+#define lookup(daddr) \
+({ \
+ struct inet_peer *u, **v; \
+ stackptr = stack; \
+ *stackptr++ = &peer_root; \
+ for (u = peer_root; u != peer_avl_empty; ) { \
+ if (daddr == u->v4daddr) \
+ break; \
+ if (daddr < u->v4daddr) \
+ v = &u->avl_left; \
+ else \
+ v = &u->avl_right; \
+ *stackptr++ = v; \
+ u = *v; \
+ } \
+ u; \
+})
+
+/* Called with local BH disabled and the pool write lock held. */
+#define lookup_rightempty(start) \
+({ \
+ struct inet_peer *u, **v; \
+ *stackptr++ = &start->avl_left; \
+ v = &start->avl_left; \
+ for (u = *v; u->avl_right != peer_avl_empty; ) { \
+ v = &u->avl_right; \
+ *stackptr++ = v; \
+ u = *v; \
+ } \
+ u; \
+})
+
+/* Called with local BH disabled and the pool write lock held.
+ * Variable names are the proof of operation correctness.
+ * Look into mm/map_avl.c for more detail description of the ideas. */
+static void peer_avl_rebalance(struct inet_peer **stack[],
+ struct inet_peer ***stackend)
+{
+ struct inet_peer **nodep, *node, *l, *r;
+ int lh, rh;
+
+ while (stackend > stack) {
+ nodep = *--stackend;
+ node = *nodep;
+ l = node->avl_left;
+ r = node->avl_right;
+ lh = node_height(l);
+ rh = node_height(r);
+ if (lh > rh + 1) { /* l: RH+2 */
+ struct inet_peer *ll, *lr, *lrl, *lrr;
+ int lrh;
+ ll = l->avl_left;
+ lr = l->avl_right;
+ lrh = node_height(lr);
+ if (lrh <= node_height(ll)) { /* ll: RH+1 */
+ node->avl_left = lr; /* lr: RH or RH+1 */
+ node->avl_right = r; /* r: RH */
+ node->avl_height = lrh + 1; /* RH+1 or RH+2 */
+ l->avl_left = ll; /* ll: RH+1 */
+ l->avl_right = node; /* node: RH+1 or RH+2 */
+ l->avl_height = node->avl_height + 1;
+ *nodep = l;
+ } else { /* ll: RH, lr: RH+1 */
+ lrl = lr->avl_left; /* lrl: RH or RH-1 */
+ lrr = lr->avl_right; /* lrr: RH or RH-1 */
+ node->avl_left = lrr; /* lrr: RH or RH-1 */
+ node->avl_right = r; /* r: RH */
+ node->avl_height = rh + 1; /* node: RH+1 */
+ l->avl_left = ll; /* ll: RH */
+ l->avl_right = lrl; /* lrl: RH or RH-1 */
+ l->avl_height = rh + 1; /* l: RH+1 */
+ lr->avl_left = l; /* l: RH+1 */
+ lr->avl_right = node; /* node: RH+1 */
+ lr->avl_height = rh + 2;
+ *nodep = lr;
+ }
+ } else if (rh > lh + 1) { /* r: LH+2 */
+ struct inet_peer *rr, *rl, *rlr, *rll;
+ int rlh;
+ rr = r->avl_right;
+ rl = r->avl_left;
+ rlh = node_height(rl);
+ if (rlh <= node_height(rr)) { /* rr: LH+1 */
+ node->avl_right = rl; /* rl: LH or LH+1 */
+ node->avl_left = l; /* l: LH */
+ node->avl_height = rlh + 1; /* LH+1 or LH+2 */
+ r->avl_right = rr; /* rr: LH+1 */
+ r->avl_left = node; /* node: LH+1 or LH+2 */
+ r->avl_height = node->avl_height + 1;
+ *nodep = r;
+ } else { /* rr: RH, rl: RH+1 */
+ rlr = rl->avl_right; /* rlr: LH or LH-1 */
+ rll = rl->avl_left; /* rll: LH or LH-1 */
+ node->avl_right = rll; /* rll: LH or LH-1 */
+ node->avl_left = l; /* l: LH */
+ node->avl_height = lh + 1; /* node: LH+1 */
+ r->avl_right = rr; /* rr: LH */
+ r->avl_left = rlr; /* rlr: LH or LH-1 */
+ r->avl_height = lh + 1; /* r: LH+1 */
+ rl->avl_right = r; /* r: LH+1 */
+ rl->avl_left = node; /* node: LH+1 */
+ rl->avl_height = lh + 2;
+ *nodep = rl;
+ }
+ } else {
+ node->avl_height = (lh > rh ? lh : rh) + 1;
+ }
+ }
+}
+
+/* Called with local BH disabled and the pool write lock held. */
+#define link_to_pool(n) \
+do { \
+ n->avl_height = 1; \
+ n->avl_left = peer_avl_empty; \
+ n->avl_right = peer_avl_empty; \
+ **--stackptr = n; \
+ peer_avl_rebalance(stack, stackptr); \
+} while(0)
+
+/* May be called with local BH enabled. */
+static void unlink_from_pool(struct inet_peer *p)
+{
+ int do_free;
+
+ do_free = 0;
+
+ write_lock_bh(&peer_pool_lock);
+ /* Check the reference counter. It was artificially incremented by 1
+ * in cleanup() function to prevent sudden disappearing. If the
+ * reference count is still 1 then the node is referenced only as `p'
+ * here and from the pool. So under the exclusive pool lock it's safe
+ * to remove the node and free it later. */
+ if (atomic_read(&p->refcnt) == 1) {
+ struct inet_peer **stack[PEER_MAXDEPTH];
+ struct inet_peer ***stackptr, ***delp;
+ if (lookup(p->v4daddr) != p)
+ BUG();
+ delp = stackptr - 1; /* *delp[0] == p */
+ if (p->avl_left == peer_avl_empty) {
+ *delp[0] = p->avl_right;
+ --stackptr;
+ } else {
+ /* look for a node to insert instead of p */
+ struct inet_peer *t;
+ t = lookup_rightempty(p);
+ if (*stackptr[-1] != t)
+ BUG();
+ **--stackptr = t->avl_left;
+ /* t is removed, t->v4daddr > x->v4daddr for any
+ * x in p->avl_left subtree.
+ * Put t in the old place of p. */
+ *delp[0] = t;
+ t->avl_left = p->avl_left;
+ t->avl_right = p->avl_right;
+ t->avl_height = p->avl_height;
+ if (delp[1] != &p->avl_left)
+ BUG();
+ delp[1] = &t->avl_left; /* was &p->avl_left */
+ }
+ peer_avl_rebalance(stack, stackptr);
+ peer_total--;
+ do_free = 1;
+ }
+ write_unlock_bh(&peer_pool_lock);
+
+ if (do_free)
+ kmem_cache_free(peer_cachep, p);
+ else
+ /* The node is used again. Decrease the reference counter
+ * back. The loop "cleanup -> unlink_from_unused
+ * -> unlink_from_pool -> putpeer -> link_to_unused
+ * -> cleanup (for the same node)"
+ * doesn't really exist because the entry will have a
+ * recent deletion time and will not be cleaned again soon. */
+ inet_putpeer(p);
+}
+
+/* May be called with local BH enabled. */
+static int cleanup_once(unsigned long ttl)
+{
+ struct inet_peer *p;
+
+ /* Remove the first entry from the list of unused nodes. */
+ spin_lock_bh(&inet_peer_unused_lock);
+ p = inet_peer_unused_head;
+ if (p != NULL) {
+ if (time_after(p->dtime + ttl, jiffies)) {
+ /* Do not prune fresh entries. */
+ spin_unlock_bh(&inet_peer_unused_lock);
+ return -1;
+ }
+ inet_peer_unused_head = p->unused_next;
+ if (p->unused_next != NULL)
+ p->unused_next->unused_prevp = p->unused_prevp;
+ else
+ inet_peer_unused_tailp = p->unused_prevp;
+ p->unused_prevp = NULL; /* mark as not on the list */
+ /* Grab an extra reference to prevent node disappearing
+ * before unlink_from_pool() call. */
+ atomic_inc(&p->refcnt);
+ }
+ spin_unlock_bh(&inet_peer_unused_lock);
+
+ if (p == NULL)
+ /* It means that the total number of USED entries has
+ * grown over inet_peer_threshold. It shouldn't really
+ * happen because of entry limits in route cache. */
+ return -1;
+
+ unlink_from_pool(p);
+ return 0;
+}
+
+/* Called with or without local BH being disabled. */
+struct inet_peer *inet_getpeer(__u32 daddr, int create)
+{
+ struct inet_peer *p, *n;
+ struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;
+
+ /* Look up for the address quickly. */
+ read_lock_bh(&peer_pool_lock);
+ p = lookup(daddr);
+ if (p != peer_avl_empty)
+ atomic_inc(&p->refcnt);
+ read_unlock_bh(&peer_pool_lock);
+
+ if (p != peer_avl_empty) {
+ /* The existing node has been found. */
+ /* Remove the entry from unused list if it was there. */
+ unlink_from_unused(p);
+ return p;
+ }
+
+ if (!create)
+ return NULL;
+
+ /* Allocate the space outside the locked region. */
+ n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
+ if (n == NULL)
+ return NULL;
+ n->v4daddr = daddr;
+ atomic_set(&n->refcnt, 1);
+ n->ip_id_count = secure_ip_id(daddr);
+ n->tcp_ts_stamp = 0;
+
+ write_lock_bh(&peer_pool_lock);
+ /* Check if an entry has suddenly appeared. */
+ p = lookup(daddr);
+ if (p != peer_avl_empty)
+ goto out_free;
+
+ /* Link the node. */
+ link_to_pool(n);
+ n->unused_prevp = NULL; /* not on the list */
+ peer_total++;
+ write_unlock_bh(&peer_pool_lock);
+
+ if (peer_total >= inet_peer_threshold)
+ /* Remove one less-recently-used entry. */
+ cleanup_once(0);
+
+ return n;
+
+out_free:
+ /* The appropriate node is already in the pool. */
+ atomic_inc(&p->refcnt);
+ write_unlock_bh(&peer_pool_lock);
+ /* Remove the entry from unused list if it was there. */
+ unlink_from_unused(p);
+ /* Free preallocated the preallocated node. */
+ kmem_cache_free(peer_cachep, n);
+ return p;
+}
+
+/* Called with local BH disabled. */
+static void peer_check_expire(unsigned long dummy)
+{
+ int i;
+ int ttl;
+
+ if (peer_total >= inet_peer_threshold)
+ ttl = inet_peer_minttl;
+ else
+ ttl = inet_peer_maxttl
+ - (inet_peer_maxttl - inet_peer_minttl) / HZ *
+ peer_total / inet_peer_threshold * HZ;
+ for (i = 0; i < PEER_MAX_CLEANUP_WORK && !cleanup_once(ttl); i++);
+
+ /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
+ * interval depending on the total number of entries (more entries,
+ * less interval). */
+ peer_periodic_timer.expires = jiffies
+ + inet_peer_gc_maxtime
+ - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
+ peer_total / inet_peer_threshold * HZ;
+ add_timer(&peer_periodic_timer);
+}
+
+EXPORT_SYMBOL(inet_peer_idlock);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
new file mode 100644
index 000000000000..77094aac6c28
--- /dev/null
+++ b/net/ipv4/ip_forward.c
@@ -0,0 +1,127 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The IP forwarding functionality.
+ *
+ * Version: $Id: ip_forward.c,v 1.48 2000/12/13 18:31:48 davem Exp $
+ *
+ * Authors: see ip.c
+ *
+ * Fixes:
+ * Many : Split from ip.c , see ip_input.c for
+ * history.
+ * Dave Gregorich : NULL ip_rt_put fix for multicast
+ * routing.
+ * Jos Vos : Add call_out_firewall before sending,
+ * use output device for accounting.
+ * Jos Vos : Call forward firewall after routing
+ * (always use output device).
+ * Mike McLagan : Routing by source
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/checksum.h>
+#include <linux/route.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+static inline int ip_forward_finish(struct sk_buff *skb)
+{
+ struct ip_options * opt = &(IPCB(skb)->opt);
+
+ IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
+
+ if (unlikely(opt->optlen))
+ ip_forward_options(skb);
+
+ return dst_output(skb);
+}
+
+int ip_forward(struct sk_buff *skb)
+{
+ struct iphdr *iph; /* Our header */
+ struct rtable *rt; /* Route we use */
+ struct ip_options * opt = &(IPCB(skb)->opt);
+
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
+ goto drop;
+
+ if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
+ return NET_RX_SUCCESS;
+
+ if (skb->pkt_type != PACKET_HOST)
+ goto drop;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ /*
+ * According to the RFC, we must first decrease the TTL field. If
+ * that reaches zero, we must reply an ICMP control message telling
+ * that the packet's lifetime expired.
+ */
+
+ iph = skb->nh.iph;
+
+ if (iph->ttl <= 1)
+ goto too_many_hops;
+
+ if (!xfrm4_route_forward(skb))
+ goto drop;
+
+ iph = skb->nh.iph;
+ rt = (struct rtable*)skb->dst;
+
+ if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+ goto sr_failed;
+
+ /* We are about to mangle packet. Copy it! */
+ if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
+ goto drop;
+ iph = skb->nh.iph;
+
+ /* Decrease ttl after skb cow done */
+ ip_decrease_ttl(iph);
+
+ /*
+ * We now generate an ICMP HOST REDIRECT giving the route
+ * we calculated.
+ */
+ if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr)
+ ip_rt_send_redirect(skb);
+
+ skb->priority = rt_tos2priority(iph->tos);
+
+ return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev,
+ ip_forward_finish);
+
+sr_failed:
+ /*
+ * Strict routing permits no gatewaying
+ */
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
+ goto drop;
+
+too_many_hops:
+ /* Tell the sender its packet died... */
+ icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
new file mode 100644
index 000000000000..7f68e27eb4ea
--- /dev/null
+++ b/net/ipv4/ip_fragment.c
@@ -0,0 +1,691 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The IP fragmentation functionality.
+ *
+ * Version: $Id: ip_fragment.c,v 1.59 2002/01/12 07:54:56 davem Exp $
+ *
+ * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
+ * Alan Cox <Alan.Cox@linux.org>
+ *
+ * Fixes:
+ * Alan Cox : Split from ip.c , see ip_input.c for history.
+ * David S. Miller : Begin massive cleanup...
+ * Andi Kleen : Add sysctls.
+ * xxxx : Overlapfrag bug.
+ * Ultima : ip_expire() kernel panic.
+ * Bill Hawes : Frag accounting and evictor fixes.
+ * John McDonald : 0 length frag bug.
+ * Alexey Kuznetsov: SMP races, threading, cleanup.
+ * Patrick McHardy : LRU queue of frag heads for evictor.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/jiffies.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/checksum.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/inet.h>
+#include <linux/netfilter_ipv4.h>
+
+/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
+ * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
+ * as well. Or notify me, at least. --ANK
+ */
+
+/* Fragment cache limits. We will commit 256K at one time. Should we
+ * cross that limit we will prune down to 192K. This should cope with
+ * even the most extreme cases without allowing an attacker to measurably
+ * harm machine performance.
+ */
+int sysctl_ipfrag_high_thresh = 256*1024;
+int sysctl_ipfrag_low_thresh = 192*1024;
+
+/* Important NOTE! Fragment queue must be destroyed before MSL expires.
+ * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
+ */
+int sysctl_ipfrag_time = IP_FRAG_TIME;
+
+struct ipfrag_skb_cb
+{
+ struct inet_skb_parm h;
+ int offset;
+};
+
+#define FRAG_CB(skb) ((struct ipfrag_skb_cb*)((skb)->cb))
+
+/* Describe an entry in the "incomplete datagrams" queue. */
+struct ipq {
+ struct ipq *next; /* linked list pointers */
+ struct list_head lru_list; /* lru list member */
+ u32 user;
+ u32 saddr;
+ u32 daddr;
+ u16 id;
+ u8 protocol;
+ u8 last_in;
+#define COMPLETE 4
+#define FIRST_IN 2
+#define LAST_IN 1
+
+ struct sk_buff *fragments; /* linked list of received fragments */
+ int len; /* total length of original datagram */
+ int meat;
+ spinlock_t lock;
+ atomic_t refcnt;
+ struct timer_list timer; /* when will this queue expire? */
+ struct ipq **pprev;
+ int iif;
+ struct timeval stamp;
+};
+
+/* Hash table. */
+
+#define IPQ_HASHSZ 64
+
+/* Per-bucket lock is easy to add now. */
+static struct ipq *ipq_hash[IPQ_HASHSZ];
+static DEFINE_RWLOCK(ipfrag_lock);
+static u32 ipfrag_hash_rnd;
+static LIST_HEAD(ipq_lru_list);
+int ip_frag_nqueues = 0;
+
+static __inline__ void __ipq_unlink(struct ipq *qp)
+{
+ if(qp->next)
+ qp->next->pprev = qp->pprev;
+ *qp->pprev = qp->next;
+ list_del(&qp->lru_list);
+ ip_frag_nqueues--;
+}
+
+static __inline__ void ipq_unlink(struct ipq *ipq)
+{
+ write_lock(&ipfrag_lock);
+ __ipq_unlink(ipq);
+ write_unlock(&ipfrag_lock);
+}
+
+static unsigned int ipqhashfn(u16 id, u32 saddr, u32 daddr, u8 prot)
+{
+ return jhash_3words((u32)id << 16 | prot, saddr, daddr,
+ ipfrag_hash_rnd) & (IPQ_HASHSZ - 1);
+}
+
+static struct timer_list ipfrag_secret_timer;
+int sysctl_ipfrag_secret_interval = 10 * 60 * HZ;
+
+static void ipfrag_secret_rebuild(unsigned long dummy)
+{
+ unsigned long now = jiffies;
+ int i;
+
+ write_lock(&ipfrag_lock);
+ get_random_bytes(&ipfrag_hash_rnd, sizeof(u32));
+ for (i = 0; i < IPQ_HASHSZ; i++) {
+ struct ipq *q;
+
+ q = ipq_hash[i];
+ while (q) {
+ struct ipq *next = q->next;
+ unsigned int hval = ipqhashfn(q->id, q->saddr,
+ q->daddr, q->protocol);
+
+ if (hval != i) {
+ /* Unlink. */
+ if (q->next)
+ q->next->pprev = q->pprev;
+ *q->pprev = q->next;
+
+ /* Relink to new hash chain. */
+ if ((q->next = ipq_hash[hval]) != NULL)
+ q->next->pprev = &q->next;
+ ipq_hash[hval] = q;
+ q->pprev = &ipq_hash[hval];
+ }
+
+ q = next;
+ }
+ }
+ write_unlock(&ipfrag_lock);
+
+ mod_timer(&ipfrag_secret_timer, now + sysctl_ipfrag_secret_interval);
+}
+
+atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */
+
+/* Memory Tracking Functions. */
+static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work)
+{
+ if (work)
+ *work -= skb->truesize;
+ atomic_sub(skb->truesize, &ip_frag_mem);
+ kfree_skb(skb);
+}
+
+static __inline__ void frag_free_queue(struct ipq *qp, int *work)
+{
+ if (work)
+ *work -= sizeof(struct ipq);
+ atomic_sub(sizeof(struct ipq), &ip_frag_mem);
+ kfree(qp);
+}
+
+static __inline__ struct ipq *frag_alloc_queue(void)
+{
+ struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC);
+
+ if(!qp)
+ return NULL;
+ atomic_add(sizeof(struct ipq), &ip_frag_mem);
+ return qp;
+}
+
+
+/* Destruction primitives. */
+
+/* Complete destruction of ipq. */
+static void ip_frag_destroy(struct ipq *qp, int *work)
+{
+ struct sk_buff *fp;
+
+ BUG_TRAP(qp->last_in&COMPLETE);
+ BUG_TRAP(del_timer(&qp->timer) == 0);
+
+ /* Release all fragment data. */
+ fp = qp->fragments;
+ while (fp) {
+ struct sk_buff *xp = fp->next;
+
+ frag_kfree_skb(fp, work);
+ fp = xp;
+ }
+
+ /* Finally, release the queue descriptor itself. */
+ frag_free_queue(qp, work);
+}
+
+static __inline__ void ipq_put(struct ipq *ipq, int *work)
+{
+ if (atomic_dec_and_test(&ipq->refcnt))
+ ip_frag_destroy(ipq, work);
+}
+
+/* Kill ipq entry. It is not destroyed immediately,
+ * because caller (and someone more) holds reference count.
+ */
+static void ipq_kill(struct ipq *ipq)
+{
+ if (del_timer(&ipq->timer))
+ atomic_dec(&ipq->refcnt);
+
+ if (!(ipq->last_in & COMPLETE)) {
+ ipq_unlink(ipq);
+ atomic_dec(&ipq->refcnt);
+ ipq->last_in |= COMPLETE;
+ }
+}
+
+/* Memory limiting on fragments. Evictor trashes the oldest
+ * fragment queue until we are back under the threshold.
+ */
+static void ip_evictor(void)
+{
+ struct ipq *qp;
+ struct list_head *tmp;
+ int work;
+
+ work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh;
+ if (work <= 0)
+ return;
+
+ while (work > 0) {
+ read_lock(&ipfrag_lock);
+ if (list_empty(&ipq_lru_list)) {
+ read_unlock(&ipfrag_lock);
+ return;
+ }
+ tmp = ipq_lru_list.next;
+ qp = list_entry(tmp, struct ipq, lru_list);
+ atomic_inc(&qp->refcnt);
+ read_unlock(&ipfrag_lock);
+
+ spin_lock(&qp->lock);
+ if (!(qp->last_in&COMPLETE))
+ ipq_kill(qp);
+ spin_unlock(&qp->lock);
+
+ ipq_put(qp, &work);
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+ }
+}
+
+/*
+ * Oops, a fragment queue timed out. Kill it and send an ICMP reply.
+ */
+static void ip_expire(unsigned long arg)
+{
+ struct ipq *qp = (struct ipq *) arg;
+
+ spin_lock(&qp->lock);
+
+ if (qp->last_in & COMPLETE)
+ goto out;
+
+ ipq_kill(qp);
+
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT);
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+
+ if ((qp->last_in&FIRST_IN) && qp->fragments != NULL) {
+ struct sk_buff *head = qp->fragments;
+ /* Send an ICMP "Fragment Reassembly Timeout" message. */
+ if ((head->dev = dev_get_by_index(qp->iif)) != NULL) {
+ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
+ dev_put(head->dev);
+ }
+ }
+out:
+ spin_unlock(&qp->lock);
+ ipq_put(qp, NULL);
+}
+
+/* Creation primitives. */
+
+static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in)
+{
+ struct ipq *qp;
+
+ write_lock(&ipfrag_lock);
+#ifdef CONFIG_SMP
+ /* With SMP race we have to recheck hash table, because
+ * such entry could be created on other cpu, while we
+ * promoted read lock to write lock.
+ */
+ for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+ if(qp->id == qp_in->id &&
+ qp->saddr == qp_in->saddr &&
+ qp->daddr == qp_in->daddr &&
+ qp->protocol == qp_in->protocol &&
+ qp->user == qp_in->user) {
+ atomic_inc(&qp->refcnt);
+ write_unlock(&ipfrag_lock);
+ qp_in->last_in |= COMPLETE;
+ ipq_put(qp_in, NULL);
+ return qp;
+ }
+ }
+#endif
+ qp = qp_in;
+
+ if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time))
+ atomic_inc(&qp->refcnt);
+
+ atomic_inc(&qp->refcnt);
+ if((qp->next = ipq_hash[hash]) != NULL)
+ qp->next->pprev = &qp->next;
+ ipq_hash[hash] = qp;
+ qp->pprev = &ipq_hash[hash];
+ INIT_LIST_HEAD(&qp->lru_list);
+ list_add_tail(&qp->lru_list, &ipq_lru_list);
+ ip_frag_nqueues++;
+ write_unlock(&ipfrag_lock);
+ return qp;
+}
+
+/* Add an entry to the 'ipq' queue for a newly received IP datagram. */
+static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
+{
+ struct ipq *qp;
+
+ if ((qp = frag_alloc_queue()) == NULL)
+ goto out_nomem;
+
+ qp->protocol = iph->protocol;
+ qp->last_in = 0;
+ qp->id = iph->id;
+ qp->saddr = iph->saddr;
+ qp->daddr = iph->daddr;
+ qp->user = user;
+ qp->len = 0;
+ qp->meat = 0;
+ qp->fragments = NULL;
+ qp->iif = 0;
+
+ /* Initialize a timer for this entry. */
+ init_timer(&qp->timer);
+ qp->timer.data = (unsigned long) qp; /* pointer to queue */
+ qp->timer.function = ip_expire; /* expire function */
+ spin_lock_init(&qp->lock);
+ atomic_set(&qp->refcnt, 1);
+
+ return ip_frag_intern(hash, qp);
+
+out_nomem:
+ NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n"));
+ return NULL;
+}
+
+/* Find the correct entry in the "incomplete datagrams" queue for
+ * this IP datagram, and create new one, if nothing is found.
+ */
+static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
+{
+ __u16 id = iph->id;
+ __u32 saddr = iph->saddr;
+ __u32 daddr = iph->daddr;
+ __u8 protocol = iph->protocol;
+ unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
+ struct ipq *qp;
+
+ read_lock(&ipfrag_lock);
+ for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+ if(qp->id == id &&
+ qp->saddr == saddr &&
+ qp->daddr == daddr &&
+ qp->protocol == protocol &&
+ qp->user == user) {
+ atomic_inc(&qp->refcnt);
+ read_unlock(&ipfrag_lock);
+ return qp;
+ }
+ }
+ read_unlock(&ipfrag_lock);
+
+ return ip_frag_create(hash, iph, user);
+}
+
+/* Add new segment to existing queue. */
+static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+{
+ struct sk_buff *prev, *next;
+ int flags, offset;
+ int ihl, end;
+
+ if (qp->last_in & COMPLETE)
+ goto err;
+
+ offset = ntohs(skb->nh.iph->frag_off);
+ flags = offset & ~IP_OFFSET;
+ offset &= IP_OFFSET;
+ offset <<= 3; /* offset is in 8-byte chunks */
+ ihl = skb->nh.iph->ihl * 4;
+
+ /* Determine the position of this fragment. */
+ end = offset + skb->len - ihl;
+
+ /* Is this the final fragment? */
+ if ((flags & IP_MF) == 0) {
+ /* If we already have some bits beyond end
+ * or have different end, the segment is corrrupted.
+ */
+ if (end < qp->len ||
+ ((qp->last_in & LAST_IN) && end != qp->len))
+ goto err;
+ qp->last_in |= LAST_IN;
+ qp->len = end;
+ } else {
+ if (end&7) {
+ end &= ~7;
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ if (end > qp->len) {
+ /* Some bits beyond end -> corruption. */
+ if (qp->last_in & LAST_IN)
+ goto err;
+ qp->len = end;
+ }
+ }
+ if (end == offset)
+ goto err;
+
+ if (pskb_pull(skb, ihl) == NULL)
+ goto err;
+ if (pskb_trim(skb, end-offset))
+ goto err;
+
+ /* Find out which fragments are in front and at the back of us
+ * in the chain of fragments so far. We must know where to put
+ * this fragment, right?
+ */
+ prev = NULL;
+ for(next = qp->fragments; next != NULL; next = next->next) {
+ if (FRAG_CB(next)->offset >= offset)
+ break; /* bingo! */
+ prev = next;
+ }
+
+ /* We found where to put this one. Check for overlap with
+ * preceding fragment, and, if needed, align things so that
+ * any overlaps are eliminated.
+ */
+ if (prev) {
+ int i = (FRAG_CB(prev)->offset + prev->len) - offset;
+
+ if (i > 0) {
+ offset += i;
+ if (end <= offset)
+ goto err;
+ if (!pskb_pull(skb, i))
+ goto err;
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ }
+
+ while (next && FRAG_CB(next)->offset < end) {
+ int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
+
+ if (i < next->len) {
+ /* Eat head of the next overlapped fragment
+ * and leave the loop. The next ones cannot overlap.
+ */
+ if (!pskb_pull(next, i))
+ goto err;
+ FRAG_CB(next)->offset += i;
+ qp->meat -= i;
+ if (next->ip_summed != CHECKSUM_UNNECESSARY)
+ next->ip_summed = CHECKSUM_NONE;
+ break;
+ } else {
+ struct sk_buff *free_it = next;
+
+ /* Old fragmnet is completely overridden with
+ * new one drop it.
+ */
+ next = next->next;
+
+ if (prev)
+ prev->next = next;
+ else
+ qp->fragments = next;
+
+ qp->meat -= free_it->len;
+ frag_kfree_skb(free_it, NULL);
+ }
+ }
+
+ FRAG_CB(skb)->offset = offset;
+
+ /* Insert this fragment in the chain of fragments. */
+ skb->next = next;
+ if (prev)
+ prev->next = skb;
+ else
+ qp->fragments = skb;
+
+ if (skb->dev)
+ qp->iif = skb->dev->ifindex;
+ skb->dev = NULL;
+ qp->stamp = skb->stamp;
+ qp->meat += skb->len;
+ atomic_add(skb->truesize, &ip_frag_mem);
+ if (offset == 0)
+ qp->last_in |= FIRST_IN;
+
+ write_lock(&ipfrag_lock);
+ list_move_tail(&qp->lru_list, &ipq_lru_list);
+ write_unlock(&ipfrag_lock);
+
+ return;
+
+err:
+ kfree_skb(skb);
+}
+
+
+/* Build a new IP datagram from all its fragments. */
+
+static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
+{
+ struct iphdr *iph;
+ struct sk_buff *fp, *head = qp->fragments;
+ int len;
+ int ihlen;
+
+ ipq_kill(qp);
+
+ BUG_TRAP(head != NULL);
+ BUG_TRAP(FRAG_CB(head)->offset == 0);
+
+ /* Allocate a new buffer for the datagram. */
+ ihlen = head->nh.iph->ihl*4;
+ len = ihlen + qp->len;
+
+ if(len > 65535)
+ goto out_oversize;
+
+ /* Head of list must not be cloned. */
+ if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
+ goto out_nomem;
+
+ /* If the first fragment is fragmented itself, we split
+ * it to two chunks: the first with data and paged part
+ * and the second, holding only fragments. */
+ if (skb_shinfo(head)->frag_list) {
+ struct sk_buff *clone;
+ int i, plen = 0;
+
+ if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
+ goto out_nomem;
+ clone->next = head->next;
+ head->next = clone;
+ skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+ skb_shinfo(head)->frag_list = NULL;
+ for (i=0; i<skb_shinfo(head)->nr_frags; i++)
+ plen += skb_shinfo(head)->frags[i].size;
+ clone->len = clone->data_len = head->data_len - plen;
+ head->data_len -= clone->len;
+ head->len -= clone->len;
+ clone->csum = 0;
+ clone->ip_summed = head->ip_summed;
+ atomic_add(clone->truesize, &ip_frag_mem);
+ }
+
+ skb_shinfo(head)->frag_list = head->next;
+ skb_push(head, head->data - head->nh.raw);
+ atomic_sub(head->truesize, &ip_frag_mem);
+
+ for (fp=head->next; fp; fp = fp->next) {
+ head->data_len += fp->len;
+ head->len += fp->len;
+ if (head->ip_summed != fp->ip_summed)
+ head->ip_summed = CHECKSUM_NONE;
+ else if (head->ip_summed == CHECKSUM_HW)
+ head->csum = csum_add(head->csum, fp->csum);
+ head->truesize += fp->truesize;
+ atomic_sub(fp->truesize, &ip_frag_mem);
+ }
+
+ head->next = NULL;
+ head->dev = dev;
+ head->stamp = qp->stamp;
+
+ iph = head->nh.iph;
+ iph->frag_off = 0;
+ iph->tot_len = htons(len);
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
+ qp->fragments = NULL;
+ return head;
+
+out_nomem:
+ NETDEBUG(if (net_ratelimit())
+ printk(KERN_ERR
+ "IP: queue_glue: no memory for gluing queue %p\n",
+ qp));
+ goto out_fail;
+out_oversize:
+ if (net_ratelimit())
+ printk(KERN_INFO
+ "Oversized IP packet from %d.%d.%d.%d.\n",
+ NIPQUAD(qp->saddr));
+out_fail:
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+ return NULL;
+}
+
+/* Process an incoming IP datagram fragment. */
+struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct ipq *qp;
+ struct net_device *dev;
+
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS);
+
+ /* Start by cleaning up the memory. */
+ if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
+ ip_evictor();
+
+ dev = skb->dev;
+
+ /* Lookup (or create) queue header */
+ if ((qp = ip_find(iph, user)) != NULL) {
+ struct sk_buff *ret = NULL;
+
+ spin_lock(&qp->lock);
+
+ ip_frag_queue(qp, skb);
+
+ if (qp->last_in == (FIRST_IN|LAST_IN) &&
+ qp->meat == qp->len)
+ ret = ip_frag_reasm(qp, dev);
+
+ spin_unlock(&qp->lock);
+ ipq_put(qp, NULL);
+ return ret;
+ }
+
+ IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+ kfree_skb(skb);
+ return NULL;
+}
+
+void ipfrag_init(void)
+{
+ ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
+ (jiffies ^ (jiffies >> 6)));
+
+ init_timer(&ipfrag_secret_timer);
+ ipfrag_secret_timer.function = ipfrag_secret_rebuild;
+ ipfrag_secret_timer.expires = jiffies + sysctl_ipfrag_secret_interval;
+ add_timer(&ipfrag_secret_timer);
+}
+
+EXPORT_SYMBOL(ip_defrag);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
new file mode 100644
index 000000000000..884835522224
--- /dev/null
+++ b/net/ipv4/ip_gre.c
@@ -0,0 +1,1290 @@
+/*
+ * Linux NET3: GRE over IP protocol decoder.
+ *
+ * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ipip.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+
+#ifdef CONFIG_IPV6
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+/*
+ Problems & solutions
+ --------------------
+
+ 1. The most important issue is detecting local dead loops.
+ They would cause complete host lockup in transmit, which
+ would be "resolved" by stack overflow or, if queueing is enabled,
+ with infinite looping in net_bh.
+
+ We cannot track such dead loops during route installation,
+ it is infeasible task. The most general solutions would be
+ to keep skb->encapsulation counter (sort of local ttl),
+ and silently drop packet when it expires. It is the best
+ solution, but it supposes maintaing new variable in ALL
+ skb, even if no tunneling is used.
+
+ Current solution: t->recursion lock breaks dead loops. It looks
+ like dev->tbusy flag, but I preferred new variable, because
+ the semantics is different. One day, when hard_start_xmit
+ will be multithreaded we will have to use skb->encapsulation.
+
+
+
+ 2. Networking dead loops would not kill routers, but would really
+ kill network. IP hop limit plays role of "t->recursion" in this case,
+ if we copy it from packet being encapsulated to upper header.
+ It is very good solution, but it introduces two problems:
+
+ - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
+ do not work over tunnels.
+ - traceroute does not work. I planned to relay ICMP from tunnel,
+ so that this problem would be solved and traceroute output
+ would even more informative. This idea appeared to be wrong:
+ only Linux complies to rfc1812 now (yes, guys, Linux is the only
+ true router now :-)), all routers (at least, in neighbourhood of mine)
+ return only 8 bytes of payload. It is the end.
+
+ Hence, if we want that OSPF worked or traceroute said something reasonable,
+ we should search for another solution.
+
+ One of them is to parse packet trying to detect inner encapsulation
+ made by our node. It is difficult or even impossible, especially,
+ taking into account fragmentation. TO be short, tt is not solution at all.
+
+ Current solution: The solution was UNEXPECTEDLY SIMPLE.
+ We force DF flag on tunnels with preconfigured hop limit,
+ that is ALL. :-) Well, it does not remove the problem completely,
+ but exponential growth of network traffic is changed to linear
+ (branches, that exceed pmtu are pruned) and tunnel mtu
+ fastly degrades to value <68, where looping stops.
+ Yes, it is not good if there exists a router in the loop,
+ which does not force DF, even when encapsulating packets have DF set.
+ But it is not our problem! Nobody could accuse us, we made
+ all that we could make. Even if it is your gated who injected
+ fatal route to network, even if it were you who configured
+ fatal static route: you are innocent. :-)
+
+
+
+ 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
+ practically identical code. It would be good to glue them
+ together, but it is not very evident, how to make them modular.
+ sit is integral part of IPv6, ipip and gre are naturally modular.
+ We could extract common parts (hash table, ioctl etc)
+ to a separate module (ip_tunnel.c).
+
+ Alexey Kuznetsov.
+ */
+
+static int ipgre_tunnel_init(struct net_device *dev);
+static void ipgre_tunnel_setup(struct net_device *dev);
+
+/* Fallback tunnel: no source, no destination, no key, no options */
+
+static int ipgre_fb_tunnel_init(struct net_device *dev);
+
+static struct net_device *ipgre_fb_tunnel_dev;
+
+/* Tunnel hash table */
+
+/*
+ 4 hash tables:
+
+ 3: (remote,local)
+ 2: (remote,*)
+ 1: (*,local)
+ 0: (*,*)
+
+ We require exact key match i.e. if a key is present in packet
+ it will match only tunnel with the same key; if it is not present,
+ it will match only keyless tunnel.
+
+ All keysless packets, if not matched configured keyless tunnels
+ will match fallback tunnel.
+ */
+
+#define HASH_SIZE 16
+#define HASH(addr) ((addr^(addr>>4))&0xF)
+
+static struct ip_tunnel *tunnels[4][HASH_SIZE];
+
+#define tunnels_r_l (tunnels[3])
+#define tunnels_r (tunnels[2])
+#define tunnels_l (tunnels[1])
+#define tunnels_wc (tunnels[0])
+
+static DEFINE_RWLOCK(ipgre_lock);
+
+/* Given src, dst and key, find appropriate for input tunnel. */
+
+static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key)
+{
+ unsigned h0 = HASH(remote);
+ unsigned h1 = HASH(key);
+ struct ip_tunnel *t;
+
+ for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
+ if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
+ if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ }
+ for (t = tunnels_r[h0^h1]; t; t = t->next) {
+ if (remote == t->parms.iph.daddr) {
+ if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ }
+ for (t = tunnels_l[h1]; t; t = t->next) {
+ if (local == t->parms.iph.saddr ||
+ (local == t->parms.iph.daddr && MULTICAST(local))) {
+ if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ }
+ for (t = tunnels_wc[h1]; t; t = t->next) {
+ if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+ return t;
+ }
+
+ if (ipgre_fb_tunnel_dev->flags&IFF_UP)
+ return ipgre_fb_tunnel_dev->priv;
+ return NULL;
+}
+
+static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
+{
+ u32 remote = t->parms.iph.daddr;
+ u32 local = t->parms.iph.saddr;
+ u32 key = t->parms.i_key;
+ unsigned h = HASH(key);
+ int prio = 0;
+
+ if (local)
+ prio |= 1;
+ if (remote && !MULTICAST(remote)) {
+ prio |= 2;
+ h ^= HASH(remote);
+ }
+
+ return &tunnels[prio][h];
+}
+
+static void ipgre_tunnel_link(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp = ipgre_bucket(t);
+
+ t->next = *tp;
+ write_lock_bh(&ipgre_lock);
+ *tp = t;
+ write_unlock_bh(&ipgre_lock);
+}
+
+static void ipgre_tunnel_unlink(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp;
+
+ for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
+ if (t == *tp) {
+ write_lock_bh(&ipgre_lock);
+ *tp = t->next;
+ write_unlock_bh(&ipgre_lock);
+ break;
+ }
+ }
+}
+
+static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
+{
+ u32 remote = parms->iph.daddr;
+ u32 local = parms->iph.saddr;
+ u32 key = parms->i_key;
+ struct ip_tunnel *t, **tp, *nt;
+ struct net_device *dev;
+ unsigned h = HASH(key);
+ int prio = 0;
+ char name[IFNAMSIZ];
+
+ if (local)
+ prio |= 1;
+ if (remote && !MULTICAST(remote)) {
+ prio |= 2;
+ h ^= HASH(remote);
+ }
+ for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
+ if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
+ if (key == t->parms.i_key)
+ return t;
+ }
+ }
+ if (!create)
+ return NULL;
+
+ if (parms->name[0])
+ strlcpy(name, parms->name, IFNAMSIZ);
+ else {
+ int i;
+ for (i=1; i<100; i++) {
+ sprintf(name, "gre%d", i);
+ if (__dev_get_by_name(name) == NULL)
+ break;
+ }
+ if (i==100)
+ goto failed;
+ }
+
+ dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
+ if (!dev)
+ return NULL;
+
+ dev->init = ipgre_tunnel_init;
+ nt = dev->priv;
+ nt->parms = *parms;
+
+ if (register_netdevice(dev) < 0) {
+ free_netdev(dev);
+ goto failed;
+ }
+
+ nt = dev->priv;
+ nt->parms = *parms;
+
+ dev_hold(dev);
+ ipgre_tunnel_link(nt);
+ /* Do not decrement MOD_USE_COUNT here. */
+ return nt;
+
+failed:
+ return NULL;
+}
+
+static void ipgre_tunnel_uninit(struct net_device *dev)
+{
+ ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv);
+ dev_put(dev);
+}
+
+
+static void ipgre_err(struct sk_buff *skb, u32 info)
+{
+#ifndef I_WISH_WORLD_WERE_PERFECT
+
+/* It is not :-( All the routers (except for Linux) return only
+ 8 bytes of packet payload. It means, that precise relaying of
+ ICMP in the real Internet is absolutely infeasible.
+
+ Moreover, Cisco "wise men" put GRE key to the third word
+ in GRE header. It makes impossible maintaining even soft state for keyed
+ GRE tunnels with enabled checksum. Tell them "thank you".
+
+ Well, I wonder, rfc1812 was written by Cisco employee,
+ what the hell these idiots break standrads established
+ by themself???
+ */
+
+ struct iphdr *iph = (struct iphdr*)skb->data;
+ u16 *p = (u16*)(skb->data+(iph->ihl<<2));
+ int grehlen = (iph->ihl<<2) + 4;
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+ struct ip_tunnel *t;
+ u16 flags;
+
+ flags = p[0];
+ if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
+ if (flags&(GRE_VERSION|GRE_ROUTING))
+ return;
+ if (flags&GRE_KEY) {
+ grehlen += 4;
+ if (flags&GRE_CSUM)
+ grehlen += 4;
+ }
+ }
+
+ /* If only 8 bytes returned, keyed message will be dropped here */
+ if (skb_headlen(skb) < grehlen)
+ return;
+
+ switch (type) {
+ default:
+ case ICMP_PARAMETERPROB:
+ return;
+
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ case ICMP_PORT_UNREACH:
+ /* Impossible event. */
+ return;
+ case ICMP_FRAG_NEEDED:
+ /* Soft state for pmtu is maintained by IP core. */
+ return;
+ default:
+ /* All others are translated to HOST_UNREACH.
+ rfc2003 contains "deep thoughts" about NET_UNREACH,
+ I believe they are just ether pollution. --ANK
+ */
+ break;
+ }
+ break;
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return;
+ break;
+ }
+
+ read_lock(&ipgre_lock);
+ t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0);
+ if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
+ goto out;
+
+ if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+ goto out;
+
+ if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+ t->err_count++;
+ else
+ t->err_count = 1;
+ t->err_time = jiffies;
+out:
+ read_unlock(&ipgre_lock);
+ return;
+#else
+ struct iphdr *iph = (struct iphdr*)dp;
+ struct iphdr *eiph;
+ u16 *p = (u16*)(dp+(iph->ihl<<2));
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+ int rel_type = 0;
+ int rel_code = 0;
+ int rel_info = 0;
+ u16 flags;
+ int grehlen = (iph->ihl<<2) + 4;
+ struct sk_buff *skb2;
+ struct flowi fl;
+ struct rtable *rt;
+
+ if (p[1] != htons(ETH_P_IP))
+ return;
+
+ flags = p[0];
+ if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
+ if (flags&(GRE_VERSION|GRE_ROUTING))
+ return;
+ if (flags&GRE_CSUM)
+ grehlen += 4;
+ if (flags&GRE_KEY)
+ grehlen += 4;
+ if (flags&GRE_SEQ)
+ grehlen += 4;
+ }
+ if (len < grehlen + sizeof(struct iphdr))
+ return;
+ eiph = (struct iphdr*)(dp + grehlen);
+
+ switch (type) {
+ default:
+ return;
+ case ICMP_PARAMETERPROB:
+ if (skb->h.icmph->un.gateway < (iph->ihl<<2))
+ return;
+
+ /* So... This guy found something strange INSIDE encapsulated
+ packet. Well, he is fool, but what can we do ?
+ */
+ rel_type = ICMP_PARAMETERPROB;
+ rel_info = skb->h.icmph->un.gateway - grehlen;
+ break;
+
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ case ICMP_PORT_UNREACH:
+ /* Impossible event. */
+ return;
+ case ICMP_FRAG_NEEDED:
+ /* And it is the only really necessary thing :-) */
+ rel_info = ntohs(skb->h.icmph->un.frag.mtu);
+ if (rel_info < grehlen+68)
+ return;
+ rel_info -= grehlen;
+ /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
+ if (rel_info > ntohs(eiph->tot_len))
+ return;
+ break;
+ default:
+ /* All others are translated to HOST_UNREACH.
+ rfc2003 contains "deep thoughts" about NET_UNREACH,
+ I believe, it is just ether pollution. --ANK
+ */
+ rel_type = ICMP_DEST_UNREACH;
+ rel_code = ICMP_HOST_UNREACH;
+ break;
+ }
+ break;
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return;
+ break;
+ }
+
+ /* Prepare fake skb to feed it to icmp_send */
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2 == NULL)
+ return;
+ dst_release(skb2->dst);
+ skb2->dst = NULL;
+ skb_pull(skb2, skb->data - (u8*)eiph);
+ skb2->nh.raw = skb2->data;
+
+ /* Try to guess incoming interface */
+ memset(&fl, 0, sizeof(fl));
+ fl.fl4_dst = eiph->saddr;
+ fl.fl4_tos = RT_TOS(eiph->tos);
+ fl.proto = IPPROTO_GRE;
+ if (ip_route_output_key(&rt, &fl)) {
+ kfree_skb(skb2);
+ return;
+ }
+ skb2->dev = rt->u.dst.dev;
+
+ /* route "incoming" packet */
+ if (rt->rt_flags&RTCF_LOCAL) {
+ ip_rt_put(rt);
+ rt = NULL;
+ fl.fl4_dst = eiph->daddr;
+ fl.fl4_src = eiph->saddr;
+ fl.fl4_tos = eiph->tos;
+ if (ip_route_output_key(&rt, &fl) ||
+ rt->u.dst.dev->type != ARPHRD_IPGRE) {
+ ip_rt_put(rt);
+ kfree_skb(skb2);
+ return;
+ }
+ } else {
+ ip_rt_put(rt);
+ if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
+ skb2->dst->dev->type != ARPHRD_IPGRE) {
+ kfree_skb(skb2);
+ return;
+ }
+ }
+
+ /* change mtu on this route */
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ if (rel_info > dst_mtu(skb2->dst)) {
+ kfree_skb(skb2);
+ return;
+ }
+ skb2->dst->ops->update_pmtu(skb2->dst, rel_info);
+ rel_info = htonl(rel_info);
+ } else if (type == ICMP_TIME_EXCEEDED) {
+ struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
+ if (t->parms.iph.ttl) {
+ rel_type = ICMP_DEST_UNREACH;
+ rel_code = ICMP_HOST_UNREACH;
+ }
+ }
+
+ icmp_send(skb2, rel_type, rel_code, rel_info);
+ kfree_skb(skb2);
+#endif
+}
+
+static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
+{
+ if (INET_ECN_is_ce(iph->tos)) {
+ if (skb->protocol == htons(ETH_P_IP)) {
+ IP_ECN_set_ce(skb->nh.iph);
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ IP6_ECN_set_ce(skb->nh.ipv6h);
+ }
+ }
+}
+
+static inline u8
+ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
+{
+ u8 inner = 0;
+ if (skb->protocol == htons(ETH_P_IP))
+ inner = old_iph->tos;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
+ return INET_ECN_encapsulate(tos, inner);
+}
+
+static int ipgre_rcv(struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ u8 *h;
+ u16 flags;
+ u16 csum = 0;
+ u32 key = 0;
+ u32 seqno = 0;
+ struct ip_tunnel *tunnel;
+ int offset = 4;
+
+ if (!pskb_may_pull(skb, 16))
+ goto drop_nolock;
+
+ iph = skb->nh.iph;
+ h = skb->data;
+ flags = *(u16*)h;
+
+ if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
+ /* - Version must be 0.
+ - We do not support routing headers.
+ */
+ if (flags&(GRE_VERSION|GRE_ROUTING))
+ goto drop_nolock;
+
+ if (flags&GRE_CSUM) {
+ if (skb->ip_summed == CHECKSUM_HW) {
+ csum = (u16)csum_fold(skb->csum);
+ if (csum)
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ if (skb->ip_summed == CHECKSUM_NONE) {
+ skb->csum = skb_checksum(skb, 0, skb->len, 0);
+ skb->ip_summed = CHECKSUM_HW;
+ csum = (u16)csum_fold(skb->csum);
+ }
+ offset += 4;
+ }
+ if (flags&GRE_KEY) {
+ key = *(u32*)(h + offset);
+ offset += 4;
+ }
+ if (flags&GRE_SEQ) {
+ seqno = ntohl(*(u32*)(h + offset));
+ offset += 4;
+ }
+ }
+
+ read_lock(&ipgre_lock);
+ if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
+ secpath_reset(skb);
+
+ skb->protocol = *(u16*)(h + 2);
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IP
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (flags == 0 &&
+ skb->protocol == __constant_htons(ETH_P_WCCP)) {
+ skb->protocol = __constant_htons(ETH_P_IP);
+ if ((*(h + offset) & 0xF0) != 0x40)
+ offset += 4;
+ }
+
+ skb->mac.raw = skb->nh.raw;
+ skb->nh.raw = __pskb_pull(skb, offset);
+ skb_postpull_rcsum(skb, skb->mac.raw, offset);
+ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+ skb->pkt_type = PACKET_HOST;
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (MULTICAST(iph->daddr)) {
+ /* Looped back packet, drop it! */
+ if (((struct rtable*)skb->dst)->fl.iif == 0)
+ goto drop;
+ tunnel->stat.multicast++;
+ skb->pkt_type = PACKET_BROADCAST;
+ }
+#endif
+
+ if (((flags&GRE_CSUM) && csum) ||
+ (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
+ tunnel->stat.rx_crc_errors++;
+ tunnel->stat.rx_errors++;
+ goto drop;
+ }
+ if (tunnel->parms.i_flags&GRE_SEQ) {
+ if (!(flags&GRE_SEQ) ||
+ (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
+ tunnel->stat.rx_fifo_errors++;
+ tunnel->stat.rx_errors++;
+ goto drop;
+ }
+ tunnel->i_seqno = seqno + 1;
+ }
+ tunnel->stat.rx_packets++;
+ tunnel->stat.rx_bytes += skb->len;
+ skb->dev = tunnel->dev;
+ dst_release(skb->dst);
+ skb->dst = NULL;
+ nf_reset(skb);
+ ipgre_ecn_decapsulate(iph, skb);
+ netif_rx(skb);
+ read_unlock(&ipgre_lock);
+ return(0);
+ }
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
+
+drop:
+ read_unlock(&ipgre_lock);
+drop_nolock:
+ kfree_skb(skb);
+ return(0);
+}
+
+static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+ struct net_device_stats *stats = &tunnel->stat;
+ struct iphdr *old_iph = skb->nh.iph;
+ struct iphdr *tiph;
+ u8 tos;
+ u16 df;
+ struct rtable *rt; /* Route to the other host */
+ struct net_device *tdev; /* Device to other host */
+ struct iphdr *iph; /* Our new IP header */
+ int max_headroom; /* The extra header space needed */
+ int gre_hlen;
+ u32 dst;
+ int mtu;
+
+ if (tunnel->recursion++) {
+ tunnel->stat.collisions++;
+ goto tx_error;
+ }
+
+ if (dev->hard_header) {
+ gre_hlen = 0;
+ tiph = (struct iphdr*)skb->data;
+ } else {
+ gre_hlen = tunnel->hlen;
+ tiph = &tunnel->parms.iph;
+ }
+
+ if ((dst = tiph->daddr) == 0) {
+ /* NBMA tunnel */
+
+ if (skb->dst == NULL) {
+ tunnel->stat.tx_fifo_errors++;
+ goto tx_error;
+ }
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ rt = (struct rtable*)skb->dst;
+ if ((dst = rt->rt_gateway) == 0)
+ goto tx_error_icmp;
+ }
+#ifdef CONFIG_IPV6
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct in6_addr *addr6;
+ int addr_type;
+ struct neighbour *neigh = skb->dst->neighbour;
+
+ if (neigh == NULL)
+ goto tx_error;
+
+ addr6 = (struct in6_addr*)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (addr_type == IPV6_ADDR_ANY) {
+ addr6 = &skb->nh.ipv6h->daddr;
+ addr_type = ipv6_addr_type(addr6);
+ }
+
+ if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
+ goto tx_error_icmp;
+
+ dst = addr6->s6_addr32[3];
+ }
+#endif
+ else
+ goto tx_error;
+ }
+
+ tos = tiph->tos;
+ if (tos&1) {
+ if (skb->protocol == htons(ETH_P_IP))
+ tos = old_iph->tos;
+ tos &= ~1;
+ }
+
+ {
+ struct flowi fl = { .oif = tunnel->parms.link,
+ .nl_u = { .ip4_u =
+ { .daddr = dst,
+ .saddr = tiph->saddr,
+ .tos = RT_TOS(tos) } },
+ .proto = IPPROTO_GRE };
+ if (ip_route_output_key(&rt, &fl)) {
+ tunnel->stat.tx_carrier_errors++;
+ goto tx_error;
+ }
+ }
+ tdev = rt->u.dst.dev;
+
+ if (tdev == dev) {
+ ip_rt_put(rt);
+ tunnel->stat.collisions++;
+ goto tx_error;
+ }
+
+ df = tiph->frag_off;
+ if (df)
+ mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
+ else
+ mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
+
+ if (skb->dst)
+ skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ df |= (old_iph->frag_off&htons(IP_DF));
+
+ if ((old_iph->frag_off&htons(IP_DF)) &&
+ mtu < ntohs(old_iph->tot_len)) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+ }
+#ifdef CONFIG_IPV6
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
+
+ if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
+ if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
+ rt6->rt6i_dst.plen == 128) {
+ rt6->rt6i_flags |= RTF_MODIFIED;
+ skb->dst->metrics[RTAX_MTU-1] = mtu;
+ }
+ }
+
+ if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+ }
+#endif
+
+ if (tunnel->err_count > 0) {
+ if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+ tunnel->err_count--;
+
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
+
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
+ struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+ if (!new_skb) {
+ ip_rt_put(rt);
+ stats->tx_dropped++;
+ dev_kfree_skb(skb);
+ tunnel->recursion--;
+ return 0;
+ }
+ if (skb->sk)
+ skb_set_owner_w(new_skb, skb->sk);
+ dev_kfree_skb(skb);
+ skb = new_skb;
+ old_iph = skb->nh.iph;
+ }
+
+ skb->h.raw = skb->nh.raw;
+ skb->nh.raw = skb_push(skb, gre_hlen);
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+ /*
+ * Push down and install the IPIP header.
+ */
+
+ iph = skb->nh.iph;
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr) >> 2;
+ iph->frag_off = df;
+ iph->protocol = IPPROTO_GRE;
+ iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+
+ if ((iph->ttl = tiph->ttl) == 0) {
+ if (skb->protocol == htons(ETH_P_IP))
+ iph->ttl = old_iph->ttl;
+#ifdef CONFIG_IPV6
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
+#endif
+ else
+ iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
+ }
+
+ ((u16*)(iph+1))[0] = tunnel->parms.o_flags;
+ ((u16*)(iph+1))[1] = skb->protocol;
+
+ if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
+ u32 *ptr = (u32*)(((u8*)iph) + tunnel->hlen - 4);
+
+ if (tunnel->parms.o_flags&GRE_SEQ) {
+ ++tunnel->o_seqno;
+ *ptr = htonl(tunnel->o_seqno);
+ ptr--;
+ }
+ if (tunnel->parms.o_flags&GRE_KEY) {
+ *ptr = tunnel->parms.o_key;
+ ptr--;
+ }
+ if (tunnel->parms.o_flags&GRE_CSUM) {
+ *ptr = 0;
+ *(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
+ }
+ }
+
+ nf_reset(skb);
+
+ IPTUNNEL_XMIT();
+ tunnel->recursion--;
+ return 0;
+
+tx_error_icmp:
+ dst_link_failure(skb);
+
+tx_error:
+ stats->tx_errors++;
+ dev_kfree_skb(skb);
+ tunnel->recursion--;
+ return 0;
+}
+
+static int
+ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel_parm p;
+ struct ip_tunnel *t;
+
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ t = NULL;
+ if (dev == ipgre_fb_tunnel_dev) {
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+ err = -EFAULT;
+ break;
+ }
+ t = ipgre_tunnel_locate(&p, 0);
+ }
+ if (t == NULL)
+ t = (struct ip_tunnel*)dev->priv;
+ memcpy(&p, &t->parms, sizeof(p));
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ err = -EFAULT;
+ break;
+
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ goto done;
+
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ goto done;
+
+ err = -EINVAL;
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
+ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
+ ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
+ goto done;
+ if (p.iph.ttl)
+ p.iph.frag_off |= htons(IP_DF);
+
+ if (!(p.i_flags&GRE_KEY))
+ p.i_key = 0;
+ if (!(p.o_flags&GRE_KEY))
+ p.o_key = 0;
+
+ t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
+
+ if (dev != ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+ if (t != NULL) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ unsigned nflags=0;
+
+ t = (struct ip_tunnel*)dev->priv;
+
+ if (MULTICAST(p.iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p.iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
+ err = -EINVAL;
+ break;
+ }
+ ipgre_tunnel_unlink(t);
+ t->parms.iph.saddr = p.iph.saddr;
+ t->parms.iph.daddr = p.iph.daddr;
+ t->parms.i_key = p.i_key;
+ t->parms.o_key = p.o_key;
+ memcpy(dev->dev_addr, &p.iph.saddr, 4);
+ memcpy(dev->broadcast, &p.iph.daddr, 4);
+ ipgre_tunnel_link(t);
+ netdev_state_change(dev);
+ }
+ }
+
+ if (t) {
+ err = 0;
+ if (cmd == SIOCCHGTUNNEL) {
+ t->parms.iph.ttl = p.iph.ttl;
+ t->parms.iph.tos = p.iph.tos;
+ t->parms.iph.frag_off = p.iph.frag_off;
+ }
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
+ err = -EFAULT;
+ } else
+ err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+ break;
+
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ goto done;
+
+ if (dev == ipgre_fb_tunnel_dev) {
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ goto done;
+ err = -ENOENT;
+ if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
+ goto done;
+ err = -EPERM;
+ if (t == ipgre_fb_tunnel_dev->priv)
+ goto done;
+ dev = t->dev;
+ }
+ err = unregister_netdevice(dev);
+ break;
+
+ default:
+ err = -EINVAL;
+ }
+
+done:
+ return err;
+}
+
+static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
+{
+ return &(((struct ip_tunnel*)dev->priv)->stat);
+}
+
+static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+ if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+/* Nice toy. Unfortunately, useless in real life :-)
+ It allows to construct virtual multiprotocol broadcast "LAN"
+ over the Internet, provided multicast routing is tuned.
+
+
+ I have no idea was this bicycle invented before me,
+ so that I had to set ARPHRD_IPGRE to a random value.
+ I have an impression, that Cisco could make something similar,
+ but this feature is apparently missing in IOS<=11.2(8).
+
+ I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
+ with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
+
+ ping -t 255 224.66.66.66
+
+ If nobody answers, mbone does not work.
+
+ ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
+ ip addr add 10.66.66.<somewhat>/24 dev Universe
+ ifconfig Universe up
+ ifconfig Universe add fe80::<Your_real_addr>/10
+ ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
+ ftp 10.66.66.66
+ ...
+ ftp fec0:6666:6666::193.233.7.65
+ ...
+
+ */
+
+static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
+ void *daddr, void *saddr, unsigned len)
+{
+ struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
+ struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
+ u16 *p = (u16*)(iph+1);
+
+ memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
+ p[0] = t->parms.o_flags;
+ p[1] = htons(type);
+
+ /*
+ * Set the source hardware address.
+ */
+
+ if (saddr)
+ memcpy(&iph->saddr, saddr, 4);
+
+ if (daddr) {
+ memcpy(&iph->daddr, daddr, 4);
+ return t->hlen;
+ }
+ if (iph->daddr && !MULTICAST(iph->daddr))
+ return t->hlen;
+
+ return -t->hlen;
+}
+
+static int ipgre_open(struct net_device *dev)
+{
+ struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
+
+ if (MULTICAST(t->parms.iph.daddr)) {
+ struct flowi fl = { .oif = t->parms.link,
+ .nl_u = { .ip4_u =
+ { .daddr = t->parms.iph.daddr,
+ .saddr = t->parms.iph.saddr,
+ .tos = RT_TOS(t->parms.iph.tos) } },
+ .proto = IPPROTO_GRE };
+ struct rtable *rt;
+ if (ip_route_output_key(&rt, &fl))
+ return -EADDRNOTAVAIL;
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ if (__in_dev_get(dev) == NULL)
+ return -EADDRNOTAVAIL;
+ t->mlink = dev->ifindex;
+ ip_mc_inc_group(__in_dev_get(dev), t->parms.iph.daddr);
+ }
+ return 0;
+}
+
+static int ipgre_close(struct net_device *dev)
+{
+ struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
+ if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
+ struct in_device *in_dev = inetdev_by_index(t->mlink);
+ if (in_dev) {
+ ip_mc_dec_group(in_dev, t->parms.iph.daddr);
+ in_dev_put(in_dev);
+ }
+ }
+ return 0;
+}
+
+#endif
+
+static void ipgre_tunnel_setup(struct net_device *dev)
+{
+ SET_MODULE_OWNER(dev);
+ dev->uninit = ipgre_tunnel_uninit;
+ dev->destructor = free_netdev;
+ dev->hard_start_xmit = ipgre_tunnel_xmit;
+ dev->get_stats = ipgre_tunnel_get_stats;
+ dev->do_ioctl = ipgre_tunnel_ioctl;
+ dev->change_mtu = ipgre_tunnel_change_mtu;
+
+ dev->type = ARPHRD_IPGRE;
+ dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
+ dev->mtu = 1500 - sizeof(struct iphdr) - 4;
+ dev->flags = IFF_NOARP;
+ dev->iflink = 0;
+ dev->addr_len = 4;
+}
+
+static int ipgre_tunnel_init(struct net_device *dev)
+{
+ struct net_device *tdev = NULL;
+ struct ip_tunnel *tunnel;
+ struct iphdr *iph;
+ int hlen = LL_MAX_HEADER;
+ int mtu = 1500;
+ int addend = sizeof(struct iphdr) + 4;
+
+ tunnel = (struct ip_tunnel*)dev->priv;
+ iph = &tunnel->parms.iph;
+
+ tunnel->dev = dev;
+ strcpy(tunnel->parms.name, dev->name);
+
+ memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
+ memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+
+ /* Guess output device to choose reasonable mtu and hard_header_len */
+
+ if (iph->daddr) {
+ struct flowi fl = { .oif = tunnel->parms.link,
+ .nl_u = { .ip4_u =
+ { .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .tos = RT_TOS(iph->tos) } },
+ .proto = IPPROTO_GRE };
+ struct rtable *rt;
+ if (!ip_route_output_key(&rt, &fl)) {
+ tdev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ }
+
+ dev->flags |= IFF_POINTOPOINT;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (MULTICAST(iph->daddr)) {
+ if (!iph->saddr)
+ return -EINVAL;
+ dev->flags = IFF_BROADCAST;
+ dev->hard_header = ipgre_header;
+ dev->open = ipgre_open;
+ dev->stop = ipgre_close;
+ }
+#endif
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = __dev_get_by_index(tunnel->parms.link);
+
+ if (tdev) {
+ hlen = tdev->hard_header_len;
+ mtu = tdev->mtu;
+ }
+ dev->iflink = tunnel->parms.link;
+
+ /* Precalculate GRE options length */
+ if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
+ if (tunnel->parms.o_flags&GRE_CSUM)
+ addend += 4;
+ if (tunnel->parms.o_flags&GRE_KEY)
+ addend += 4;
+ if (tunnel->parms.o_flags&GRE_SEQ)
+ addend += 4;
+ }
+ dev->hard_header_len = hlen + addend;
+ dev->mtu = mtu - addend;
+ tunnel->hlen = addend;
+ return 0;
+}
+
+int __init ipgre_fb_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+ struct iphdr *iph = &tunnel->parms.iph;
+
+ tunnel->dev = dev;
+ strcpy(tunnel->parms.name, dev->name);
+
+ iph->version = 4;
+ iph->protocol = IPPROTO_GRE;
+ iph->ihl = 5;
+ tunnel->hlen = sizeof(struct iphdr) + 4;
+
+ dev_hold(dev);
+ tunnels_wc[0] = tunnel;
+ return 0;
+}
+
+
+static struct net_protocol ipgre_protocol = {
+ .handler = ipgre_rcv,
+ .err_handler = ipgre_err,
+};
+
+
+/*
+ * And now the modules code and kernel interface.
+ */
+
+static int __init ipgre_init(void)
+{
+ int err;
+
+ printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
+
+ if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
+ printk(KERN_INFO "ipgre init: can't add protocol\n");
+ return -EAGAIN;
+ }
+
+ ipgre_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
+ ipgre_tunnel_setup);
+ if (!ipgre_fb_tunnel_dev) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ ipgre_fb_tunnel_dev->init = ipgre_fb_tunnel_init;
+
+ if ((err = register_netdev(ipgre_fb_tunnel_dev)))
+ goto err2;
+out:
+ return err;
+err2:
+ free_netdev(ipgre_fb_tunnel_dev);
+err1:
+ inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
+ goto out;
+}
+
+static void ipgre_fini(void)
+{
+ if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
+ printk(KERN_INFO "ipgre close: can't remove protocol\n");
+
+ unregister_netdev(ipgre_fb_tunnel_dev);
+}
+
+module_init(ipgre_init);
+module_exit(ipgre_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
new file mode 100644
index 000000000000..a0d0833034be
--- /dev/null
+++ b/net/ipv4/ip_input.c
@@ -0,0 +1,431 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The Internet Protocol (IP) module.
+ *
+ * Version: $Id: ip_input.c,v 1.55 2002/01/12 07:39:45 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Donald Becker, <becker@super.org>
+ * Alan Cox, <Alan.Cox@linux.org>
+ * Richard Underwood
+ * Stefan Becker, <stefanb@yello.ping.de>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *
+ *
+ * Fixes:
+ * Alan Cox : Commented a couple of minor bits of surplus code
+ * Alan Cox : Undefining IP_FORWARD doesn't include the code
+ * (just stops a compiler warning).
+ * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes
+ * are junked rather than corrupting things.
+ * Alan Cox : Frames to bad broadcast subnets are dumped
+ * We used to process them non broadcast and
+ * boy could that cause havoc.
+ * Alan Cox : ip_forward sets the free flag on the
+ * new frame it queues. Still crap because
+ * it copies the frame but at least it
+ * doesn't eat memory too.
+ * Alan Cox : Generic queue code and memory fixes.
+ * Fred Van Kempen : IP fragment support (borrowed from NET2E)
+ * Gerhard Koerting: Forward fragmented frames correctly.
+ * Gerhard Koerting: Fixes to my fix of the above 8-).
+ * Gerhard Koerting: IP interface addressing fix.
+ * Linus Torvalds : More robustness checks
+ * Alan Cox : Even more checks: Still not as robust as it ought to be
+ * Alan Cox : Save IP header pointer for later
+ * Alan Cox : ip option setting
+ * Alan Cox : Use ip_tos/ip_ttl settings
+ * Alan Cox : Fragmentation bogosity removed
+ * (Thanks to Mark.Bush@prg.ox.ac.uk)
+ * Dmitry Gorodchanin : Send of a raw packet crash fix.
+ * Alan Cox : Silly ip bug when an overlength
+ * fragment turns up. Now frees the
+ * queue.
+ * Linus Torvalds/ : Memory leakage on fragmentation
+ * Alan Cox : handling.
+ * Gerhard Koerting: Forwarding uses IP priority hints
+ * Teemu Rantanen : Fragment problems.
+ * Alan Cox : General cleanup, comments and reformat
+ * Alan Cox : SNMP statistics
+ * Alan Cox : BSD address rule semantics. Also see
+ * UDP as there is a nasty checksum issue
+ * if you do things the wrong way.
+ * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file
+ * Alan Cox : IP options adjust sk->priority.
+ * Pedro Roque : Fix mtu/length error in ip_forward.
+ * Alan Cox : Avoid ip_chk_addr when possible.
+ * Richard Underwood : IP multicasting.
+ * Alan Cox : Cleaned up multicast handlers.
+ * Alan Cox : RAW sockets demultiplex in the BSD style.
+ * Gunther Mayer : Fix the SNMP reporting typo
+ * Alan Cox : Always in group 224.0.0.1
+ * Pauline Middelink : Fast ip_checksum update when forwarding
+ * Masquerading support.
+ * Alan Cox : Multicast loopback error for 224.0.0.1
+ * Alan Cox : IP_MULTICAST_LOOP option.
+ * Alan Cox : Use notifiers.
+ * Bjorn Ekwall : Removed ip_csum (from slhc.c too)
+ * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!)
+ * Stefan Becker : Send out ICMP HOST REDIRECT
+ * Arnt Gulbrandsen : ip_build_xmit
+ * Alan Cox : Per socket routing cache
+ * Alan Cox : Fixed routing cache, added header cache.
+ * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it.
+ * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net.
+ * Alan Cox : Incoming IP option handling.
+ * Alan Cox : Set saddr on raw output frames as per BSD.
+ * Alan Cox : Stopped broadcast source route explosions.
+ * Alan Cox : Can disable source routing
+ * Takeshi Sone : Masquerading didn't work.
+ * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible.
+ * Alan Cox : Memory leaks, tramples, misc debugging.
+ * Alan Cox : Fixed multicast (by popular demand 8))
+ * Alan Cox : Fixed forwarding (by even more popular demand 8))
+ * Alan Cox : Fixed SNMP statistics [I think]
+ * Gerhard Koerting : IP fragmentation forwarding fix
+ * Alan Cox : Device lock against page fault.
+ * Alan Cox : IP_HDRINCL facility.
+ * Werner Almesberger : Zero fragment bug
+ * Alan Cox : RAW IP frame length bug
+ * Alan Cox : Outgoing firewall on build_xmit
+ * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel
+ * Alan Cox : Multicast routing hooks
+ * Jos Vos : Do accounting *before* call_in_firewall
+ * Willy Konynenberg : Transparent proxying support
+ *
+ *
+ *
+ * To Fix:
+ * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
+ * and could be made very efficient with the addition of some virtual memory hacks to permit
+ * the allocation of a buffer that can then be 'grown' by twiddling page tables.
+ * Output fragmentation wants updating along with the buffer management to use a single
+ * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
+ * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
+ * fragmentation anyway.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/system.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/config.h>
+
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/raw.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/xfrm.h>
+#include <linux/mroute.h>
+#include <linux/netlink.h>
+
+/*
+ * SNMP management statistics
+ */
+
+DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics);
+
+/*
+ * Process Router Attention IP option
+ */
+int ip_call_ra_chain(struct sk_buff *skb)
+{
+ struct ip_ra_chain *ra;
+ u8 protocol = skb->nh.iph->protocol;
+ struct sock *last = NULL;
+
+ read_lock(&ip_ra_lock);
+ for (ra = ip_ra_chain; ra; ra = ra->next) {
+ struct sock *sk = ra->sk;
+
+ /* If socket is bound to an interface, only report
+ * the packet if it came from that interface.
+ */
+ if (sk && inet_sk(sk)->num == protocol &&
+ (!sk->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == skb->dev->ifindex)) {
+ if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+ skb = ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN);
+ if (skb == NULL) {
+ read_unlock(&ip_ra_lock);
+ return 1;
+ }
+ }
+ if (last) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ raw_rcv(last, skb2);
+ }
+ last = sk;
+ }
+ }
+
+ if (last) {
+ raw_rcv(last, skb);
+ read_unlock(&ip_ra_lock);
+ return 1;
+ }
+ read_unlock(&ip_ra_lock);
+ return 0;
+}
+
+static inline int ip_local_deliver_finish(struct sk_buff *skb)
+{
+ int ihl = skb->nh.iph->ihl*4;
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ nf_debug_ip_local_deliver(skb);
+#endif /*CONFIG_NETFILTER_DEBUG*/
+
+ __skb_pull(skb, ihl);
+
+ /* Free reference early: we don't need it any more, and it may
+ hold ip_conntrack module loaded indefinitely. */
+ nf_reset(skb);
+
+ /* Point into the IP datagram, just past the header. */
+ skb->h.raw = skb->data;
+
+ rcu_read_lock();
+ {
+ /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
+ int protocol = skb->nh.iph->protocol;
+ int hash;
+ struct sock *raw_sk;
+ struct net_protocol *ipprot;
+
+ resubmit:
+ hash = protocol & (MAX_INET_PROTOS - 1);
+ raw_sk = sk_head(&raw_v4_htable[hash]);
+
+ /* If there maybe a raw socket we must check - if not we
+ * don't care less
+ */
+ if (raw_sk)
+ raw_v4_input(skb, skb->nh.iph, hash);
+
+ if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
+ int ret;
+
+ if (!ipprot->no_policy &&
+ !xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ kfree_skb(skb);
+ goto out;
+ }
+ ret = ipprot->handler(skb);
+ if (ret < 0) {
+ protocol = -ret;
+ goto resubmit;
+ }
+ IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
+ } else {
+ if (!raw_sk) {
+ if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_PROT_UNREACH, 0);
+ }
+ } else
+ IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
+ kfree_skb(skb);
+ }
+ }
+ out:
+ rcu_read_unlock();
+
+ return 0;
+}
+
+/*
+ * Deliver IP Packets to the higher protocol layers.
+ */
+int ip_local_deliver(struct sk_buff *skb)
+{
+ /*
+ * Reassemble IP fragments.
+ */
+
+ if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+ skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER);
+ if (!skb)
+ return 0;
+ }
+
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL,
+ ip_local_deliver_finish);
+}
+
+static inline int ip_rcv_finish(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct iphdr *iph = skb->nh.iph;
+
+ /*
+ * Initialise the virtual path cache for the packet. It describes
+ * how the packet travels inside Linux networking.
+ */
+ if (skb->dst == NULL) {
+ if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
+ goto drop;
+ }
+
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (skb->dst->tclassid) {
+ struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
+ u32 idx = skb->dst->tclassid;
+ st[idx&0xFF].o_packets++;
+ st[idx&0xFF].o_bytes+=skb->len;
+ st[(idx>>16)&0xFF].i_packets++;
+ st[(idx>>16)&0xFF].i_bytes+=skb->len;
+ }
+#endif
+
+ if (iph->ihl > 5) {
+ struct ip_options *opt;
+
+ /* It looks as overkill, because not all
+ IP options require packet mangling.
+ But it is the easiest for now, especially taking
+ into account that combination of IP options
+ and running sniffer is extremely rare condition.
+ --ANK (980813)
+ */
+
+ if (skb_cow(skb, skb_headroom(skb))) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+ iph = skb->nh.iph;
+
+ if (ip_options_compile(NULL, skb))
+ goto inhdr_error;
+
+ opt = &(IPCB(skb)->opt);
+ if (opt->srr) {
+ struct in_device *in_dev = in_dev_get(dev);
+ if (in_dev) {
+ if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
+ if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
+ printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n",
+ NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+ in_dev_put(in_dev);
+ goto drop;
+ }
+ in_dev_put(in_dev);
+ }
+ if (ip_options_rcv_srr(skb))
+ goto drop;
+ }
+ }
+
+ return dst_input(skb);
+
+inhdr_error:
+ IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+/*
+ * Main IP Receive routine.
+ */
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+{
+ struct iphdr *iph;
+
+ /* When the interface is in promisc. mode, drop all the crap
+ * that it receives, do not try to analyse it.
+ */
+ if (skb->pkt_type == PACKET_OTHERHOST)
+ goto drop;
+
+ IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+ goto out;
+ }
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto inhdr_error;
+
+ iph = skb->nh.iph;
+
+ /*
+ * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
+ *
+ * Is the datagram acceptable?
+ *
+ * 1. Length at least the size of an ip header
+ * 2. Version of 4
+ * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
+ * 4. Doesn't have a bogus length
+ */
+
+ if (iph->ihl < 5 || iph->version != 4)
+ goto inhdr_error;
+
+ if (!pskb_may_pull(skb, iph->ihl*4))
+ goto inhdr_error;
+
+ iph = skb->nh.iph;
+
+ if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
+ goto inhdr_error;
+
+ {
+ __u32 len = ntohs(iph->tot_len);
+ if (skb->len < len || len < (iph->ihl<<2))
+ goto inhdr_error;
+
+ /* Our transport medium may have padded the buffer out. Now we know it
+ * is IP we can trim to the true length of the frame.
+ * Note this now means skb->len holds ntohs(iph->tot_len).
+ */
+ if (pskb_trim_rcsum(skb, len)) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+ }
+
+ return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
+ ip_rcv_finish);
+
+inhdr_error:
+ IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
+drop:
+ kfree_skb(skb);
+out:
+ return NET_RX_DROP;
+}
+
+EXPORT_SYMBOL(ip_rcv);
+EXPORT_SYMBOL(ip_statistics);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
new file mode 100644
index 000000000000..6d89f3f3e701
--- /dev/null
+++ b/net/ipv4/ip_options.c
@@ -0,0 +1,625 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The options processing module for ip.c
+ *
+ * Version: $Id: ip_options.c,v 1.21 2001/09/01 00:31:50 davem Exp $
+ *
+ * Authors: A.N.Kuznetsov
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+
+/*
+ * Write options to IP header, record destination address to
+ * source route option, address of outgoing interface
+ * (we should already know it, so that this function is allowed be
+ * called only after routing decision) and timestamp,
+ * if we originate this datagram.
+ *
+ * daddr is real destination address, next hop is recorded in IP header.
+ * saddr is address of outgoing interface.
+ */
+
+void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
+ u32 daddr, struct rtable *rt, int is_frag)
+{
+ unsigned char * iph = skb->nh.raw;
+
+ memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
+ memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
+ opt = &(IPCB(skb)->opt);
+ opt->is_data = 0;
+
+ if (opt->srr)
+ memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
+
+ if (!is_frag) {
+ if (opt->rr_needaddr)
+ ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt);
+ if (opt->ts_needaddr)
+ ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt);
+ if (opt->ts_needtime) {
+ struct timeval tv;
+ __u32 midtime;
+ do_gettimeofday(&tv);
+ midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
+ memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
+ }
+ return;
+ }
+ if (opt->rr) {
+ memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
+ opt->rr = 0;
+ opt->rr_needaddr = 0;
+ }
+ if (opt->ts) {
+ memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
+ opt->ts = 0;
+ opt->ts_needaddr = opt->ts_needtime = 0;
+ }
+}
+
+/*
+ * Provided (sopt, skb) points to received options,
+ * build in dopt compiled option set appropriate for answering.
+ * i.e. invert SRR option, copy anothers,
+ * and grab room in RR/TS options.
+ *
+ * NOTE: dopt cannot point to skb.
+ */
+
+int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
+{
+ struct ip_options *sopt;
+ unsigned char *sptr, *dptr;
+ int soffset, doffset;
+ int optlen;
+ u32 daddr;
+
+ memset(dopt, 0, sizeof(struct ip_options));
+
+ dopt->is_data = 1;
+
+ sopt = &(IPCB(skb)->opt);
+
+ if (sopt->optlen == 0) {
+ dopt->optlen = 0;
+ return 0;
+ }
+
+ sptr = skb->nh.raw;
+ dptr = dopt->__data;
+
+ if (skb->dst)
+ daddr = ((struct rtable*)skb->dst)->rt_spec_dst;
+ else
+ daddr = skb->nh.iph->daddr;
+
+ if (sopt->rr) {
+ optlen = sptr[sopt->rr+1];
+ soffset = sptr[sopt->rr+2];
+ dopt->rr = dopt->optlen + sizeof(struct iphdr);
+ memcpy(dptr, sptr+sopt->rr, optlen);
+ if (sopt->rr_needaddr && soffset <= optlen) {
+ if (soffset + 3 > optlen)
+ return -EINVAL;
+ dptr[2] = soffset + 4;
+ dopt->rr_needaddr = 1;
+ }
+ dptr += optlen;
+ dopt->optlen += optlen;
+ }
+ if (sopt->ts) {
+ optlen = sptr[sopt->ts+1];
+ soffset = sptr[sopt->ts+2];
+ dopt->ts = dopt->optlen + sizeof(struct iphdr);
+ memcpy(dptr, sptr+sopt->ts, optlen);
+ if (soffset <= optlen) {
+ if (sopt->ts_needaddr) {
+ if (soffset + 3 > optlen)
+ return -EINVAL;
+ dopt->ts_needaddr = 1;
+ soffset += 4;
+ }
+ if (sopt->ts_needtime) {
+ if (soffset + 3 > optlen)
+ return -EINVAL;
+ if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) {
+ dopt->ts_needtime = 1;
+ soffset += 4;
+ } else {
+ dopt->ts_needtime = 0;
+
+ if (soffset + 8 <= optlen) {
+ __u32 addr;
+
+ memcpy(&addr, sptr+soffset-1, 4);
+ if (inet_addr_type(addr) != RTN_LOCAL) {
+ dopt->ts_needtime = 1;
+ soffset += 8;
+ }
+ }
+ }
+ }
+ dptr[2] = soffset;
+ }
+ dptr += optlen;
+ dopt->optlen += optlen;
+ }
+ if (sopt->srr) {
+ unsigned char * start = sptr+sopt->srr;
+ u32 faddr;
+
+ optlen = start[1];
+ soffset = start[2];
+ doffset = 0;
+ if (soffset > optlen)
+ soffset = optlen + 1;
+ soffset -= 4;
+ if (soffset > 3) {
+ memcpy(&faddr, &start[soffset-1], 4);
+ for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4)
+ memcpy(&dptr[doffset-1], &start[soffset-1], 4);
+ /*
+ * RFC1812 requires to fix illegal source routes.
+ */
+ if (memcmp(&skb->nh.iph->saddr, &start[soffset+3], 4) == 0)
+ doffset -= 4;
+ }
+ if (doffset > 3) {
+ memcpy(&start[doffset-1], &daddr, 4);
+ dopt->faddr = faddr;
+ dptr[0] = start[0];
+ dptr[1] = doffset+3;
+ dptr[2] = 4;
+ dptr += doffset+3;
+ dopt->srr = dopt->optlen + sizeof(struct iphdr);
+ dopt->optlen += doffset+3;
+ dopt->is_strictroute = sopt->is_strictroute;
+ }
+ }
+ while (dopt->optlen & 3) {
+ *dptr++ = IPOPT_END;
+ dopt->optlen++;
+ }
+ return 0;
+}
+
+/*
+ * Options "fragmenting", just fill options not
+ * allowed in fragments with NOOPs.
+ * Simple and stupid 8), but the most efficient way.
+ */
+
+void ip_options_fragment(struct sk_buff * skb)
+{
+ unsigned char * optptr = skb->nh.raw;
+ struct ip_options * opt = &(IPCB(skb)->opt);
+ int l = opt->optlen;
+ int optlen;
+
+ while (l > 0) {
+ switch (*optptr) {
+ case IPOPT_END:
+ return;
+ case IPOPT_NOOP:
+ l--;
+ optptr++;
+ continue;
+ }
+ optlen = optptr[1];
+ if (optlen<2 || optlen>l)
+ return;
+ if (!IPOPT_COPIED(*optptr))
+ memset(optptr, IPOPT_NOOP, optlen);
+ l -= optlen;
+ optptr += optlen;
+ }
+ opt->ts = 0;
+ opt->rr = 0;
+ opt->rr_needaddr = 0;
+ opt->ts_needaddr = 0;
+ opt->ts_needtime = 0;
+ return;
+}
+
+/*
+ * Verify options and fill pointers in struct options.
+ * Caller should clear *opt, and set opt->data.
+ * If opt == NULL, then skb->data should point to IP header.
+ */
+
+int ip_options_compile(struct ip_options * opt, struct sk_buff * skb)
+{
+ int l;
+ unsigned char * iph;
+ unsigned char * optptr;
+ int optlen;
+ unsigned char * pp_ptr = NULL;
+ struct rtable *rt = skb ? (struct rtable*)skb->dst : NULL;
+
+ if (!opt) {
+ opt = &(IPCB(skb)->opt);
+ memset(opt, 0, sizeof(struct ip_options));
+ iph = skb->nh.raw;
+ opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr);
+ optptr = iph + sizeof(struct iphdr);
+ opt->is_data = 0;
+ } else {
+ optptr = opt->is_data ? opt->__data : (unsigned char*)&(skb->nh.iph[1]);
+ iph = optptr - sizeof(struct iphdr);
+ }
+
+ for (l = opt->optlen; l > 0; ) {
+ switch (*optptr) {
+ case IPOPT_END:
+ for (optptr++, l--; l>0; optptr++, l--) {
+ if (*optptr != IPOPT_END) {
+ *optptr = IPOPT_END;
+ opt->is_changed = 1;
+ }
+ }
+ goto eol;
+ case IPOPT_NOOP:
+ l--;
+ optptr++;
+ continue;
+ }
+ optlen = optptr[1];
+ if (optlen<2 || optlen>l) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ switch (*optptr) {
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
+ if (optlen < 3) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] < 4) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ /* NB: cf RFC-1812 5.2.4.1 */
+ if (opt->srr) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ if (!skb) {
+ if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ memcpy(&opt->faddr, &optptr[3], 4);
+ if (optlen > 7)
+ memmove(&optptr[3], &optptr[7], optlen-7);
+ }
+ opt->is_strictroute = (optptr[0] == IPOPT_SSRR);
+ opt->srr = optptr - iph;
+ break;
+ case IPOPT_RR:
+ if (opt->rr) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ if (optlen < 3) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] < 4) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (optptr[2] <= optlen) {
+ if (optptr[2]+3 > optlen) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (skb) {
+ memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+ opt->is_changed = 1;
+ }
+ optptr[2] += 4;
+ opt->rr_needaddr = 1;
+ }
+ opt->rr = optptr - iph;
+ break;
+ case IPOPT_TIMESTAMP:
+ if (opt->ts) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ if (optlen < 4) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] < 5) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (optptr[2] <= optlen) {
+ __u32 * timeptr = NULL;
+ if (optptr[2]+3 > optptr[1]) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ switch (optptr[3]&0xF) {
+ case IPOPT_TS_TSONLY:
+ opt->ts = optptr - iph;
+ if (skb)
+ timeptr = (__u32*)&optptr[optptr[2]-1];
+ opt->ts_needtime = 1;
+ optptr[2] += 4;
+ break;
+ case IPOPT_TS_TSANDADDR:
+ if (optptr[2]+7 > optptr[1]) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ opt->ts = optptr - iph;
+ if (skb) {
+ memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+ timeptr = (__u32*)&optptr[optptr[2]+3];
+ }
+ opt->ts_needaddr = 1;
+ opt->ts_needtime = 1;
+ optptr[2] += 8;
+ break;
+ case IPOPT_TS_PRESPEC:
+ if (optptr[2]+7 > optptr[1]) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ opt->ts = optptr - iph;
+ {
+ u32 addr;
+ memcpy(&addr, &optptr[optptr[2]-1], 4);
+ if (inet_addr_type(addr) == RTN_UNICAST)
+ break;
+ if (skb)
+ timeptr = (__u32*)&optptr[optptr[2]+3];
+ }
+ opt->ts_needtime = 1;
+ optptr[2] += 8;
+ break;
+ default:
+ if (!skb && !capable(CAP_NET_RAW)) {
+ pp_ptr = optptr + 3;
+ goto error;
+ }
+ break;
+ }
+ if (timeptr) {
+ struct timeval tv;
+ __u32 midtime;
+ do_gettimeofday(&tv);
+ midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
+ memcpy(timeptr, &midtime, sizeof(__u32));
+ opt->is_changed = 1;
+ }
+ } else {
+ unsigned overflow = optptr[3]>>4;
+ if (overflow == 15) {
+ pp_ptr = optptr + 3;
+ goto error;
+ }
+ opt->ts = optptr - iph;
+ if (skb) {
+ optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
+ opt->is_changed = 1;
+ }
+ }
+ break;
+ case IPOPT_RA:
+ if (optlen < 4) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] == 0 && optptr[3] == 0)
+ opt->router_alert = optptr - iph;
+ break;
+ case IPOPT_SEC:
+ case IPOPT_SID:
+ default:
+ if (!skb && !capable(CAP_NET_RAW)) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ break;
+ }
+ l -= optlen;
+ optptr += optlen;
+ }
+
+eol:
+ if (!pp_ptr)
+ return 0;
+
+error:
+ if (skb) {
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
+ }
+ return -EINVAL;
+}
+
+
+/*
+ * Undo all the changes done by ip_options_compile().
+ */
+
+void ip_options_undo(struct ip_options * opt)
+{
+ if (opt->srr) {
+ unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr);
+ memmove(optptr+7, optptr+3, optptr[1]-7);
+ memcpy(optptr+3, &opt->faddr, 4);
+ }
+ if (opt->rr_needaddr) {
+ unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr);
+ optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ }
+ if (opt->ts) {
+ unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr);
+ if (opt->ts_needtime) {
+ optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
+ optptr[2] -= 4;
+ }
+ if (opt->ts_needaddr) {
+ optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ }
+ }
+}
+
+int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user)
+{
+ struct ip_options *opt;
+
+ opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL);
+ if (!opt)
+ return -ENOMEM;
+ memset(opt, 0, sizeof(struct ip_options));
+ if (optlen) {
+ if (user) {
+ if (copy_from_user(opt->__data, data, optlen)) {
+ kfree(opt);
+ return -EFAULT;
+ }
+ } else
+ memcpy(opt->__data, data, optlen);
+ }
+ while (optlen & 3)
+ opt->__data[optlen++] = IPOPT_END;
+ opt->optlen = optlen;
+ opt->is_data = 1;
+ opt->is_setbyuser = 1;
+ if (optlen && ip_options_compile(opt, NULL)) {
+ kfree(opt);
+ return -EINVAL;
+ }
+ if (*optp)
+ kfree(*optp);
+ *optp = opt;
+ return 0;
+}
+
+void ip_forward_options(struct sk_buff *skb)
+{
+ struct ip_options * opt = &(IPCB(skb)->opt);
+ unsigned char * optptr;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ unsigned char *raw = skb->nh.raw;
+
+ if (opt->rr_needaddr) {
+ optptr = (unsigned char *)raw + opt->rr;
+ ip_rt_get_source(&optptr[optptr[2]-5], rt);
+ opt->is_changed = 1;
+ }
+ if (opt->srr_is_hit) {
+ int srrptr, srrspace;
+
+ optptr = raw + opt->srr;
+
+ for ( srrptr=optptr[2], srrspace = optptr[1];
+ srrptr <= srrspace;
+ srrptr += 4
+ ) {
+ if (srrptr + 3 > srrspace)
+ break;
+ if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0)
+ break;
+ }
+ if (srrptr + 3 <= srrspace) {
+ opt->is_changed = 1;
+ ip_rt_get_source(&optptr[srrptr-1], rt);
+ skb->nh.iph->daddr = rt->rt_dst;
+ optptr[2] = srrptr+4;
+ } else if (net_ratelimit())
+ printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
+ if (opt->ts_needaddr) {
+ optptr = raw + opt->ts;
+ ip_rt_get_source(&optptr[optptr[2]-9], rt);
+ opt->is_changed = 1;
+ }
+ }
+ if (opt->is_changed) {
+ opt->is_changed = 0;
+ ip_send_check(skb->nh.iph);
+ }
+}
+
+int ip_options_rcv_srr(struct sk_buff *skb)
+{
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ int srrspace, srrptr;
+ u32 nexthop;
+ struct iphdr *iph = skb->nh.iph;
+ unsigned char * optptr = skb->nh.raw + opt->srr;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct rtable *rt2;
+ int err;
+
+ if (!opt->srr)
+ return 0;
+
+ if (skb->pkt_type != PACKET_HOST)
+ return -EINVAL;
+ if (rt->rt_type == RTN_UNICAST) {
+ if (!opt->is_strictroute)
+ return 0;
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24));
+ return -EINVAL;
+ }
+ if (rt->rt_type != RTN_LOCAL)
+ return -EINVAL;
+
+ for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
+ if (srrptr + 3 > srrspace) {
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
+ return -EINVAL;
+ }
+ memcpy(&nexthop, &optptr[srrptr-1], 4);
+
+ rt = (struct rtable*)skb->dst;
+ skb->dst = NULL;
+ err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
+ rt2 = (struct rtable*)skb->dst;
+ if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
+ ip_rt_put(rt2);
+ skb->dst = &rt->u.dst;
+ return -EINVAL;
+ }
+ ip_rt_put(rt);
+ if (rt2->rt_type != RTN_LOCAL)
+ break;
+ /* Superfast 8) loopback forward */
+ memcpy(&iph->daddr, &optptr[srrptr-1], 4);
+ opt->is_changed = 1;
+ }
+ if (srrptr <= srrspace) {
+ opt->srr_is_hit = 1;
+ opt->is_changed = 1;
+ }
+ return 0;
+}
+
+EXPORT_SYMBOL(ip_options_compile);
+EXPORT_SYMBOL(ip_options_undo);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
new file mode 100644
index 000000000000..30ab7b6ab761
--- /dev/null
+++ b/net/ipv4/ip_output.c
@@ -0,0 +1,1359 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The Internet Protocol (IP) output module.
+ *
+ * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Donald Becker, <becker@super.org>
+ * Alan Cox, <Alan.Cox@linux.org>
+ * Richard Underwood
+ * Stefan Becker, <stefanb@yello.ping.de>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Hirokazu Takahashi, <taka@valinux.co.jp>
+ *
+ * See ip_input.c for original log
+ *
+ * Fixes:
+ * Alan Cox : Missing nonblock feature in ip_build_xmit.
+ * Mike Kilburn : htons() missing in ip_build_xmit.
+ * Bradford Johnson: Fix faulty handling of some frames when
+ * no route is found.
+ * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
+ * (in case if packet not accepted by
+ * output firewall rules)
+ * Mike McLagan : Routing by source
+ * Alexey Kuznetsov: use new route cache
+ * Andi Kleen: Fix broken PMTU recovery and remove
+ * some redundant tests.
+ * Vitaly E. Lavrov : Transparent proxy revived after year coma.
+ * Andi Kleen : Replace ip_reply with ip_send_reply.
+ * Andi Kleen : Split fast and slow ip_build_xmit path
+ * for decreased register pressure on x86
+ * and more readibility.
+ * Marc Boucher : When call_out_firewall returns FW_QUEUE,
+ * silently drop skb instead of failing with -EPERM.
+ * Detlev Wengorz : Copy protocol for fragments.
+ * Hirokazu Takahashi: HW checksumming for outgoing UDP
+ * datagrams.
+ * Hirokazu Takahashi: sendfile() on UDP works now.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/config.h>
+
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/raw.h>
+#include <net/checksum.h>
+#include <net/inetpeer.h>
+#include <net/checksum.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/mroute.h>
+#include <linux/netlink.h>
+
+/*
+ * Shall we try to damage output packets if routing dev changes?
+ */
+
+int sysctl_ip_dynaddr;
+int sysctl_ip_default_ttl = IPDEFTTL;
+
+/* Generate a checksum for an outgoing IP datagram. */
+__inline__ void ip_send_check(struct iphdr *iph)
+{
+ iph->check = 0;
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+}
+
+/* dev_loopback_xmit for use with netfilter. */
+static int ip_dev_loopback_xmit(struct sk_buff *newskb)
+{
+ newskb->mac.raw = newskb->data;
+ __skb_pull(newskb, newskb->nh.raw - newskb->data);
+ newskb->pkt_type = PACKET_LOOPBACK;
+ newskb->ip_summed = CHECKSUM_UNNECESSARY;
+ BUG_TRAP(newskb->dst);
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ nf_debug_ip_loopback_xmit(newskb);
+#endif
+ netif_rx(newskb);
+ return 0;
+}
+
+static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
+{
+ int ttl = inet->uc_ttl;
+
+ if (ttl < 0)
+ ttl = dst_metric(dst, RTAX_HOPLIMIT);
+ return ttl;
+}
+
+/*
+ * Add an ip header to a skbuff and send it out.
+ *
+ */
+int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
+ u32 saddr, u32 daddr, struct ip_options *opt)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct rtable *rt = (struct rtable *)skb->dst;
+ struct iphdr *iph;
+
+ /* Build the IP header. */
+ if (opt)
+ iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
+ else
+ iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
+
+ iph->version = 4;
+ iph->ihl = 5;
+ iph->tos = inet->tos;
+ if (ip_dont_fragment(sk, &rt->u.dst))
+ iph->frag_off = htons(IP_DF);
+ else
+ iph->frag_off = 0;
+ iph->ttl = ip_select_ttl(inet, &rt->u.dst);
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+ iph->protocol = sk->sk_protocol;
+ iph->tot_len = htons(skb->len);
+ ip_select_ident(iph, &rt->u.dst, sk);
+ skb->nh.iph = iph;
+
+ if (opt && opt->optlen) {
+ iph->ihl += opt->optlen>>2;
+ ip_options_build(skb, opt, daddr, rt, 0);
+ }
+ ip_send_check(iph);
+
+ skb->priority = sk->sk_priority;
+
+ /* Send it out. */
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ dst_output);
+}
+
+static inline int ip_finish_output2(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb->dst;
+ struct hh_cache *hh = dst->hh;
+ struct net_device *dev = dst->dev;
+ int hh_len = LL_RESERVED_SPACE(dev);
+
+ /* Be paranoid, rather than too clever. */
+ if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
+ struct sk_buff *skb2;
+
+ skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
+ if (skb2 == NULL) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+ kfree_skb(skb);
+ skb = skb2;
+ }
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ nf_debug_ip_finish_output2(skb);
+#endif /*CONFIG_NETFILTER_DEBUG*/
+
+ if (hh) {
+ int hh_alen;
+
+ read_lock_bh(&hh->hh_lock);
+ hh_alen = HH_DATA_ALIGN(hh->hh_len);
+ memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
+ read_unlock_bh(&hh->hh_lock);
+ skb_push(skb, hh->hh_len);
+ return hh->hh_output(skb);
+ } else if (dst->neighbour)
+ return dst->neighbour->output(skb);
+
+ if (net_ratelimit())
+ printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+int ip_finish_output(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dst->dev;
+
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_IP);
+
+ return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
+ ip_finish_output2);
+}
+
+int ip_mc_output(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct net_device *dev = rt->u.dst.dev;
+
+ /*
+ * If the indicated interface is up and running, send the packet.
+ */
+ IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
+
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_IP);
+
+ /*
+ * Multicasts are looped back for other local users
+ */
+
+ if (rt->rt_flags&RTCF_MULTICAST) {
+ if ((!sk || inet_sk(sk)->mc_loop)
+#ifdef CONFIG_IP_MROUTE
+ /* Small optimization: do not loopback not local frames,
+ which returned after forwarding; they will be dropped
+ by ip_mr_input in any case.
+ Note, that local frames are looped back to be delivered
+ to local recipients.
+
+ This check is duplicated in ip_mr_input at the moment.
+ */
+ && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
+#endif
+ ) {
+ struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+ if (newskb)
+ NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
+ newskb->dev,
+ ip_dev_loopback_xmit);
+ }
+
+ /* Multicasts with ttl 0 must not go beyond the host */
+
+ if (skb->nh.iph->ttl == 0) {
+ kfree_skb(skb);
+ return 0;
+ }
+ }
+
+ if (rt->rt_flags&RTCF_BROADCAST) {
+ struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+ if (newskb)
+ NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
+ newskb->dev, ip_dev_loopback_xmit);
+ }
+
+ if (skb->len > dst_mtu(&rt->u.dst))
+ return ip_fragment(skb, ip_finish_output);
+ else
+ return ip_finish_output(skb);
+}
+
+int ip_output(struct sk_buff *skb)
+{
+ IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
+
+ if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
+ return ip_fragment(skb, ip_finish_output);
+ else
+ return ip_finish_output(skb);
+}
+
+int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
+{
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_options *opt = inet->opt;
+ struct rtable *rt;
+ struct iphdr *iph;
+
+ /* Skip all of this if the packet is already routed,
+ * f.e. by something like SCTP.
+ */
+ rt = (struct rtable *) skb->dst;
+ if (rt != NULL)
+ goto packet_routed;
+
+ /* Make sure we can route this packet. */
+ rt = (struct rtable *)__sk_dst_check(sk, 0);
+ if (rt == NULL) {
+ u32 daddr;
+
+ /* Use correct destination address if we have options. */
+ daddr = inet->daddr;
+ if(opt && opt->srr)
+ daddr = opt->faddr;
+
+ {
+ struct flowi fl = { .oif = sk->sk_bound_dev_if,
+ .nl_u = { .ip4_u =
+ { .daddr = daddr,
+ .saddr = inet->saddr,
+ .tos = RT_CONN_FLAGS(sk) } },
+ .proto = sk->sk_protocol,
+ .uli_u = { .ports =
+ { .sport = inet->sport,
+ .dport = inet->dport } } };
+
+ /* If this fails, retransmit mechanism of transport layer will
+ * keep trying until route appears or the connection times
+ * itself out.
+ */
+ if (ip_route_output_flow(&rt, &fl, sk, 0))
+ goto no_route;
+ }
+ __sk_dst_set(sk, &rt->u.dst);
+ tcp_v4_setup_caps(sk, &rt->u.dst);
+ }
+ skb->dst = dst_clone(&rt->u.dst);
+
+packet_routed:
+ if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+ goto no_route;
+
+ /* OK, we know where to send it, allocate and build IP header. */
+ iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
+ *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
+ iph->tot_len = htons(skb->len);
+ if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
+ iph->frag_off = htons(IP_DF);
+ else
+ iph->frag_off = 0;
+ iph->ttl = ip_select_ttl(inet, &rt->u.dst);
+ iph->protocol = sk->sk_protocol;
+ iph->saddr = rt->rt_src;
+ iph->daddr = rt->rt_dst;
+ skb->nh.iph = iph;
+ /* Transport layer set skb->h.foo itself. */
+
+ if (opt && opt->optlen) {
+ iph->ihl += opt->optlen >> 2;
+ ip_options_build(skb, opt, inet->daddr, rt, 0);
+ }
+
+ ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
+
+ /* Add an IP checksum. */
+ ip_send_check(iph);
+
+ skb->priority = sk->sk_priority;
+
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ dst_output);
+
+no_route:
+ IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
+ kfree_skb(skb);
+ return -EHOSTUNREACH;
+}
+
+
+static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
+{
+ to->pkt_type = from->pkt_type;
+ to->priority = from->priority;
+ to->protocol = from->protocol;
+ to->security = from->security;
+ dst_release(to->dst);
+ to->dst = dst_clone(from->dst);
+ to->dev = from->dev;
+
+ /* Copy the flags to each fragment. */
+ IPCB(to)->flags = IPCB(from)->flags;
+
+#ifdef CONFIG_NET_SCHED
+ to->tc_index = from->tc_index;
+#endif
+#ifdef CONFIG_NETFILTER
+ to->nfmark = from->nfmark;
+ to->nfcache = from->nfcache;
+ /* Connection association is same as pre-frag packet */
+ nf_conntrack_put(to->nfct);
+ to->nfct = from->nfct;
+ nf_conntrack_get(to->nfct);
+ to->nfctinfo = from->nfctinfo;
+#ifdef CONFIG_BRIDGE_NETFILTER
+ nf_bridge_put(to->nf_bridge);
+ to->nf_bridge = from->nf_bridge;
+ nf_bridge_get(to->nf_bridge);
+#endif
+#ifdef CONFIG_NETFILTER_DEBUG
+ to->nf_debug = from->nf_debug;
+#endif
+#endif
+}
+
+/*
+ * This IP datagram is too large to be sent in one piece. Break it up into
+ * smaller pieces (each of size equal to IP header plus
+ * a block of the data of the original IP data part) that will yet fit in a
+ * single device frame, and queue such a frame for sending.
+ */
+
+int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
+{
+ struct iphdr *iph;
+ int raw = 0;
+ int ptr;
+ struct net_device *dev;
+ struct sk_buff *skb2;
+ unsigned int mtu, hlen, left, len, ll_rs;
+ int offset;
+ int not_last_frag;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ int err = 0;
+
+ dev = rt->u.dst.dev;
+
+ /*
+ * Point into the IP datagram header.
+ */
+
+ iph = skb->nh.iph;
+
+ if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(dst_mtu(&rt->u.dst)));
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ /*
+ * Setup starting values.
+ */
+
+ hlen = iph->ihl * 4;
+ mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
+
+ /* When frag_list is given, use it. First, check its validity:
+ * some transformers could create wrong frag_list or break existing
+ * one, it is not prohibited. In this case fall back to copying.
+ *
+ * LATER: this step can be merged to real generation of fragments,
+ * we can switch to copy when see the first bad fragment.
+ */
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *frag;
+ int first_len = skb_pagelen(skb);
+
+ if (first_len - hlen > mtu ||
+ ((first_len - hlen) & 7) ||
+ (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
+ skb_cloned(skb))
+ goto slow_path;
+
+ for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
+ /* Correct geometry. */
+ if (frag->len > mtu ||
+ ((frag->len & 7) && frag->next) ||
+ skb_headroom(frag) < hlen)
+ goto slow_path;
+
+ /* Partially cloned skb? */
+ if (skb_shared(frag))
+ goto slow_path;
+ }
+
+ /* Everything is OK. Generate! */
+
+ err = 0;
+ offset = 0;
+ frag = skb_shinfo(skb)->frag_list;
+ skb_shinfo(skb)->frag_list = NULL;
+ skb->data_len = first_len - skb_headlen(skb);
+ skb->len = first_len;
+ iph->tot_len = htons(first_len);
+ iph->frag_off = htons(IP_MF);
+ ip_send_check(iph);
+
+ for (;;) {
+ /* Prepare header of the next frame,
+ * before previous one went down. */
+ if (frag) {
+ frag->ip_summed = CHECKSUM_NONE;
+ frag->h.raw = frag->data;
+ frag->nh.raw = __skb_push(frag, hlen);
+ memcpy(frag->nh.raw, iph, hlen);
+ iph = frag->nh.iph;
+ iph->tot_len = htons(frag->len);
+ ip_copy_metadata(frag, skb);
+ if (offset == 0)
+ ip_options_fragment(frag);
+ offset += skb->len - hlen;
+ iph->frag_off = htons(offset>>3);
+ if (frag->next != NULL)
+ iph->frag_off |= htons(IP_MF);
+ /* Ready, complete checksum */
+ ip_send_check(iph);
+ }
+
+ err = output(skb);
+
+ if (err || !frag)
+ break;
+
+ skb = frag;
+ frag = skb->next;
+ skb->next = NULL;
+ }
+
+ if (err == 0) {
+ IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
+ return 0;
+ }
+
+ while (frag) {
+ skb = frag->next;
+ kfree_skb(frag);
+ frag = skb;
+ }
+ IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
+ return err;
+ }
+
+slow_path:
+ left = skb->len - hlen; /* Space per frame */
+ ptr = raw + hlen; /* Where to start from */
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+ /* for bridged IP traffic encapsulated inside f.e. a vlan header,
+ * we need to make room for the encapsulating header */
+ ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
+ mtu -= nf_bridge_pad(skb);
+#else
+ ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
+#endif
+ /*
+ * Fragment the datagram.
+ */
+
+ offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
+ not_last_frag = iph->frag_off & htons(IP_MF);
+
+ /*
+ * Keep copying data until we run out.
+ */
+
+ while(left > 0) {
+ len = left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > mtu)
+ len = mtu;
+ /* IF: we are not sending upto and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < left) {
+ len &= ~7;
+ }
+ /*
+ * Allocate buffer.
+ */
+
+ if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
+ NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ /*
+ * Set up data on packet
+ */
+
+ ip_copy_metadata(skb2, skb);
+ skb_reserve(skb2, ll_rs);
+ skb_put(skb2, len + hlen);
+ skb2->nh.raw = skb2->data;
+ skb2->h.raw = skb2->data + hlen;
+
+ /*
+ * Charge the memory for the fragment to any owner
+ * it might possess
+ */
+
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+
+ memcpy(skb2->nh.raw, skb->data, hlen);
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
+ BUG();
+ left -= len;
+
+ /*
+ * Fill in the new header fields.
+ */
+ iph = skb2->nh.iph;
+ iph->frag_off = htons((offset >> 3));
+
+ /* ANK: dirty, but effective trick. Upgrade options only if
+ * the segment to be fragmented was THE FIRST (otherwise,
+ * options are already fixed) and make it ONCE
+ * on the initial skb, so that all the following fragments
+ * will inherit fixed options.
+ */
+ if (offset == 0)
+ ip_options_fragment(skb);
+
+ /*
+ * Added AC : If we are fragmenting a fragment that's not the
+ * last fragment then keep MF on each bit
+ */
+ if (left > 0 || not_last_frag)
+ iph->frag_off |= htons(IP_MF);
+ ptr += len;
+ offset += len;
+
+ /*
+ * Put this fragment into the sending queue.
+ */
+
+ IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
+
+ iph->tot_len = htons(len + hlen);
+
+ ip_send_check(iph);
+
+ err = output(skb2);
+ if (err)
+ goto fail;
+ }
+ kfree_skb(skb);
+ IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
+ return err;
+
+fail:
+ kfree_skb(skb);
+ IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
+ return err;
+}
+
+int
+ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
+{
+ struct iovec *iov = from;
+
+ if (skb->ip_summed == CHECKSUM_HW) {
+ if (memcpy_fromiovecend(to, iov, offset, len) < 0)
+ return -EFAULT;
+ } else {
+ unsigned int csum = 0;
+ if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
+ return -EFAULT;
+ skb->csum = csum_block_add(skb->csum, csum, odd);
+ }
+ return 0;
+}
+
+static inline unsigned int
+csum_page(struct page *page, int offset, int copy)
+{
+ char *kaddr;
+ unsigned int csum;
+ kaddr = kmap(page);
+ csum = csum_partial(kaddr + offset, copy, 0);
+ kunmap(page);
+ return csum;
+}
+
+/*
+ * ip_append_data() and ip_append_page() can make one large IP datagram
+ * from many pieces of data. Each pieces will be holded on the socket
+ * until ip_push_pending_frames() is called. Each piece can be a page
+ * or non-page data.
+ *
+ * Not only UDP, other transport protocols - e.g. raw sockets - can use
+ * this interface potentially.
+ *
+ * LATER: length must be adjusted by pad at tail, when it is required.
+ */
+int ip_append_data(struct sock *sk,
+ int getfrag(void *from, char *to, int offset, int len,
+ int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ struct ipcm_cookie *ipc, struct rtable *rt,
+ unsigned int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sk_buff *skb;
+
+ struct ip_options *opt = NULL;
+ int hh_len;
+ int exthdrlen;
+ int mtu;
+ int copy;
+ int err;
+ int offset = 0;
+ unsigned int maxfraglen, fragheaderlen;
+ int csummode = CHECKSUM_NONE;
+
+ if (flags&MSG_PROBE)
+ return 0;
+
+ if (skb_queue_empty(&sk->sk_write_queue)) {
+ /*
+ * setup for corking.
+ */
+ opt = ipc->opt;
+ if (opt) {
+ if (inet->cork.opt == NULL) {
+ inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
+ if (unlikely(inet->cork.opt == NULL))
+ return -ENOBUFS;
+ }
+ memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
+ inet->cork.flags |= IPCORK_OPT;
+ inet->cork.addr = ipc->addr;
+ }
+ dst_hold(&rt->u.dst);
+ inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
+ inet->cork.rt = rt;
+ inet->cork.length = 0;
+ sk->sk_sndmsg_page = NULL;
+ sk->sk_sndmsg_off = 0;
+ if ((exthdrlen = rt->u.dst.header_len) != 0) {
+ length += exthdrlen;
+ transhdrlen += exthdrlen;
+ }
+ } else {
+ rt = inet->cork.rt;
+ if (inet->cork.flags & IPCORK_OPT)
+ opt = inet->cork.opt;
+
+ transhdrlen = 0;
+ exthdrlen = 0;
+ mtu = inet->cork.fragsize;
+ }
+ hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+
+ fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+ maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+
+ if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
+ ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
+ return -EMSGSIZE;
+ }
+
+ /*
+ * transhdrlen > 0 means that this is the first fragment and we wish
+ * it won't be fragmented in the future.
+ */
+ if (transhdrlen &&
+ length + fragheaderlen <= mtu &&
+ rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
+ !exthdrlen)
+ csummode = CHECKSUM_HW;
+
+ inet->cork.length += length;
+
+ /* So, what's going on in the loop below?
+ *
+ * We use calculated fragment length to generate chained skb,
+ * each of segments is IP fragment ready for sending to network after
+ * adding appropriate IP header.
+ */
+
+ if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
+ goto alloc_new_skb;
+
+ while (length > 0) {
+ /* Check if the remaining data fits into current packet. */
+ copy = mtu - skb->len;
+ if (copy < length)
+ copy = maxfraglen - skb->len;
+ if (copy <= 0) {
+ char *data;
+ unsigned int datalen;
+ unsigned int fraglen;
+ unsigned int fraggap;
+ unsigned int alloclen;
+ struct sk_buff *skb_prev;
+alloc_new_skb:
+ skb_prev = skb;
+ if (skb_prev)
+ fraggap = skb_prev->len - maxfraglen;
+ else
+ fraggap = 0;
+
+ /*
+ * If remaining data exceeds the mtu,
+ * we know we need more fragment(s).
+ */
+ datalen = length + fraggap;
+ if (datalen > mtu - fragheaderlen)
+ datalen = maxfraglen - fragheaderlen;
+ fraglen = datalen + fragheaderlen;
+
+ if ((flags & MSG_MORE) &&
+ !(rt->u.dst.dev->features&NETIF_F_SG))
+ alloclen = mtu;
+ else
+ alloclen = datalen + fragheaderlen;
+
+ /* The last fragment gets additional space at tail.
+ * Note, with MSG_MORE we overallocate on fragments,
+ * because we have no idea what fragment will be
+ * the last.
+ */
+ if (datalen == length)
+ alloclen += rt->u.dst.trailer_len;
+
+ if (transhdrlen) {
+ skb = sock_alloc_send_skb(sk,
+ alloclen + hh_len + 15,
+ (flags & MSG_DONTWAIT), &err);
+ } else {
+ skb = NULL;
+ if (atomic_read(&sk->sk_wmem_alloc) <=
+ 2 * sk->sk_sndbuf)
+ skb = sock_wmalloc(sk,
+ alloclen + hh_len + 15, 1,
+ sk->sk_allocation);
+ if (unlikely(skb == NULL))
+ err = -ENOBUFS;
+ }
+ if (skb == NULL)
+ goto error;
+
+ /*
+ * Fill in the control structures
+ */
+ skb->ip_summed = csummode;
+ skb->csum = 0;
+ skb_reserve(skb, hh_len);
+
+ /*
+ * Find where to start putting bytes.
+ */
+ data = skb_put(skb, fraglen);
+ skb->nh.raw = data + exthdrlen;
+ data += fragheaderlen;
+ skb->h.raw = data + exthdrlen;
+
+ if (fraggap) {
+ skb->csum = skb_copy_and_csum_bits(
+ skb_prev, maxfraglen,
+ data + transhdrlen, fraggap, 0);
+ skb_prev->csum = csum_sub(skb_prev->csum,
+ skb->csum);
+ data += fraggap;
+ skb_trim(skb_prev, maxfraglen);
+ }
+
+ copy = datalen - transhdrlen - fraggap;
+ if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+ err = -EFAULT;
+ kfree_skb(skb);
+ goto error;
+ }
+
+ offset += copy;
+ length -= datalen - fraggap;
+ transhdrlen = 0;
+ exthdrlen = 0;
+ csummode = CHECKSUM_NONE;
+
+ /*
+ * Put the packet on the pending queue.
+ */
+ __skb_queue_tail(&sk->sk_write_queue, skb);
+ continue;
+ }
+
+ if (copy > length)
+ copy = length;
+
+ if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
+ unsigned int off;
+
+ off = skb->len;
+ if (getfrag(from, skb_put(skb, copy),
+ offset, copy, off, skb) < 0) {
+ __skb_trim(skb, off);
+ err = -EFAULT;
+ goto error;
+ }
+ } else {
+ int i = skb_shinfo(skb)->nr_frags;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
+ struct page *page = sk->sk_sndmsg_page;
+ int off = sk->sk_sndmsg_off;
+ unsigned int left;
+
+ if (page && (left = PAGE_SIZE - off) > 0) {
+ if (copy >= left)
+ copy = left;
+ if (page != frag->page) {
+ if (i == MAX_SKB_FRAGS) {
+ err = -EMSGSIZE;
+ goto error;
+ }
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
+ frag = &skb_shinfo(skb)->frags[i];
+ }
+ } else if (i < MAX_SKB_FRAGS) {
+ if (copy > PAGE_SIZE)
+ copy = PAGE_SIZE;
+ page = alloc_pages(sk->sk_allocation, 0);
+ if (page == NULL) {
+ err = -ENOMEM;
+ goto error;
+ }
+ sk->sk_sndmsg_page = page;
+ sk->sk_sndmsg_off = 0;
+
+ skb_fill_page_desc(skb, i, page, 0, 0);
+ frag = &skb_shinfo(skb)->frags[i];
+ skb->truesize += PAGE_SIZE;
+ atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
+ } else {
+ err = -EMSGSIZE;
+ goto error;
+ }
+ if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
+ err = -EFAULT;
+ goto error;
+ }
+ sk->sk_sndmsg_off += copy;
+ frag->size += copy;
+ skb->len += copy;
+ skb->data_len += copy;
+ }
+ offset += copy;
+ length -= copy;
+ }
+
+ return 0;
+
+error:
+ inet->cork.length -= length;
+ IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
+ return err;
+}
+
+ssize_t ip_append_page(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sk_buff *skb;
+ struct rtable *rt;
+ struct ip_options *opt = NULL;
+ int hh_len;
+ int mtu;
+ int len;
+ int err;
+ unsigned int maxfraglen, fragheaderlen, fraggap;
+
+ if (inet->hdrincl)
+ return -EPERM;
+
+ if (flags&MSG_PROBE)
+ return 0;
+
+ if (skb_queue_empty(&sk->sk_write_queue))
+ return -EINVAL;
+
+ rt = inet->cork.rt;
+ if (inet->cork.flags & IPCORK_OPT)
+ opt = inet->cork.opt;
+
+ if (!(rt->u.dst.dev->features&NETIF_F_SG))
+ return -EOPNOTSUPP;
+
+ hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+ mtu = inet->cork.fragsize;
+
+ fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+ maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+
+ if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
+ ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
+ return -EMSGSIZE;
+ }
+
+ if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
+ return -EINVAL;
+
+ inet->cork.length += size;
+
+ while (size > 0) {
+ int i;
+
+ /* Check if the remaining data fits into current packet. */
+ len = mtu - skb->len;
+ if (len < size)
+ len = maxfraglen - skb->len;
+ if (len <= 0) {
+ struct sk_buff *skb_prev;
+ char *data;
+ struct iphdr *iph;
+ int alloclen;
+
+ skb_prev = skb;
+ if (skb_prev)
+ fraggap = skb_prev->len - maxfraglen;
+ else
+ fraggap = 0;
+
+ alloclen = fragheaderlen + hh_len + fraggap + 15;
+ skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
+ if (unlikely(!skb)) {
+ err = -ENOBUFS;
+ goto error;
+ }
+
+ /*
+ * Fill in the control structures
+ */
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->csum = 0;
+ skb_reserve(skb, hh_len);
+
+ /*
+ * Find where to start putting bytes.
+ */
+ data = skb_put(skb, fragheaderlen + fraggap);
+ skb->nh.iph = iph = (struct iphdr *)data;
+ data += fragheaderlen;
+ skb->h.raw = data;
+
+ if (fraggap) {
+ skb->csum = skb_copy_and_csum_bits(
+ skb_prev, maxfraglen,
+ data, fraggap, 0);
+ skb_prev->csum = csum_sub(skb_prev->csum,
+ skb->csum);
+ skb_trim(skb_prev, maxfraglen);
+ }
+
+ /*
+ * Put the packet on the pending queue.
+ */
+ __skb_queue_tail(&sk->sk_write_queue, skb);
+ continue;
+ }
+
+ i = skb_shinfo(skb)->nr_frags;
+ if (len > size)
+ len = size;
+ if (skb_can_coalesce(skb, i, page, offset)) {
+ skb_shinfo(skb)->frags[i-1].size += len;
+ } else if (i < MAX_SKB_FRAGS) {
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, offset, len);
+ } else {
+ err = -EMSGSIZE;
+ goto error;
+ }
+
+ if (skb->ip_summed == CHECKSUM_NONE) {
+ unsigned int csum;
+ csum = csum_page(page, offset, len);
+ skb->csum = csum_block_add(skb->csum, csum, skb->len);
+ }
+
+ skb->len += len;
+ skb->data_len += len;
+ offset += len;
+ size -= len;
+ }
+ return 0;
+
+error:
+ inet->cork.length -= size;
+ IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
+ return err;
+}
+
+/*
+ * Combined all pending IP fragments on the socket as one IP datagram
+ * and push them out.
+ */
+int ip_push_pending_frames(struct sock *sk)
+{
+ struct sk_buff *skb, *tmp_skb;
+ struct sk_buff **tail_skb;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_options *opt = NULL;
+ struct rtable *rt = inet->cork.rt;
+ struct iphdr *iph;
+ int df = 0;
+ __u8 ttl;
+ int err = 0;
+
+ if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
+ goto out;
+ tail_skb = &(skb_shinfo(skb)->frag_list);
+
+ /* move skb->data to ip header from ext header */
+ if (skb->data < skb->nh.raw)
+ __skb_pull(skb, skb->nh.raw - skb->data);
+ while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+ __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
+ *tail_skb = tmp_skb;
+ tail_skb = &(tmp_skb->next);
+ skb->len += tmp_skb->len;
+ skb->data_len += tmp_skb->len;
+ skb->truesize += tmp_skb->truesize;
+ __sock_put(tmp_skb->sk);
+ tmp_skb->destructor = NULL;
+ tmp_skb->sk = NULL;
+ }
+
+ /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
+ * to fragment the frame generated here. No matter, what transforms
+ * how transforms change size of the packet, it will come out.
+ */
+ if (inet->pmtudisc != IP_PMTUDISC_DO)
+ skb->local_df = 1;
+
+ /* DF bit is set when we want to see DF on outgoing frames.
+ * If local_df is set too, we still allow to fragment this frame
+ * locally. */
+ if (inet->pmtudisc == IP_PMTUDISC_DO ||
+ (skb->len <= dst_mtu(&rt->u.dst) &&
+ ip_dont_fragment(sk, &rt->u.dst)))
+ df = htons(IP_DF);
+
+ if (inet->cork.flags & IPCORK_OPT)
+ opt = inet->cork.opt;
+
+ if (rt->rt_type == RTN_MULTICAST)
+ ttl = inet->mc_ttl;
+ else
+ ttl = ip_select_ttl(inet, &rt->u.dst);
+
+ iph = (struct iphdr *)skb->data;
+ iph->version = 4;
+ iph->ihl = 5;
+ if (opt) {
+ iph->ihl += opt->optlen>>2;
+ ip_options_build(skb, opt, inet->cork.addr, rt, 0);
+ }
+ iph->tos = inet->tos;
+ iph->tot_len = htons(skb->len);
+ iph->frag_off = df;
+ if (!df) {
+ __ip_select_ident(iph, &rt->u.dst, 0);
+ } else {
+ iph->id = htons(inet->id++);
+ }
+ iph->ttl = ttl;
+ iph->protocol = sk->sk_protocol;
+ iph->saddr = rt->rt_src;
+ iph->daddr = rt->rt_dst;
+ ip_send_check(iph);
+
+ skb->priority = sk->sk_priority;
+ skb->dst = dst_clone(&rt->u.dst);
+
+ /* Netfilter gets whole the not fragmented skb. */
+ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
+ skb->dst->dev, dst_output);
+ if (err) {
+ if (err > 0)
+ err = inet->recverr ? net_xmit_errno(err) : 0;
+ if (err)
+ goto error;
+ }
+
+out:
+ inet->cork.flags &= ~IPCORK_OPT;
+ if (inet->cork.opt) {
+ kfree(inet->cork.opt);
+ inet->cork.opt = NULL;
+ }
+ if (inet->cork.rt) {
+ ip_rt_put(inet->cork.rt);
+ inet->cork.rt = NULL;
+ }
+ return err;
+
+error:
+ IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
+ goto out;
+}
+
+/*
+ * Throw away all pending data on the socket.
+ */
+void ip_flush_pending_frames(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
+ kfree_skb(skb);
+
+ inet->cork.flags &= ~IPCORK_OPT;
+ if (inet->cork.opt) {
+ kfree(inet->cork.opt);
+ inet->cork.opt = NULL;
+ }
+ if (inet->cork.rt) {
+ ip_rt_put(inet->cork.rt);
+ inet->cork.rt = NULL;
+ }
+}
+
+
+/*
+ * Fetch data from kernel space and fill in checksum if needed.
+ */
+static int ip_reply_glue_bits(void *dptr, char *to, int offset,
+ int len, int odd, struct sk_buff *skb)
+{
+ unsigned int csum;
+
+ csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
+ skb->csum = csum_block_add(skb->csum, csum, odd);
+ return 0;
+}
+
+/*
+ * Generic function to send a packet as reply to another packet.
+ * Used to send TCP resets so far. ICMP should use this function too.
+ *
+ * Should run single threaded per socket because it uses the sock
+ * structure to pass arguments.
+ *
+ * LATER: switch from ip_build_xmit to ip_append_*
+ */
+void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
+ unsigned int len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct {
+ struct ip_options opt;
+ char data[40];
+ } replyopts;
+ struct ipcm_cookie ipc;
+ u32 daddr;
+ struct rtable *rt = (struct rtable*)skb->dst;
+
+ if (ip_options_echo(&replyopts.opt, skb))
+ return;
+
+ daddr = ipc.addr = rt->rt_src;
+ ipc.opt = NULL;
+
+ if (replyopts.opt.optlen) {
+ ipc.opt = &replyopts.opt;
+
+ if (ipc.opt->srr)
+ daddr = replyopts.opt.faddr;
+ }
+
+ {
+ struct flowi fl = { .nl_u = { .ip4_u =
+ { .daddr = daddr,
+ .saddr = rt->rt_spec_dst,
+ .tos = RT_TOS(skb->nh.iph->tos) } },
+ /* Not quite clean, but right. */
+ .uli_u = { .ports =
+ { .sport = skb->h.th->dest,
+ .dport = skb->h.th->source } },
+ .proto = sk->sk_protocol };
+ if (ip_route_output_key(&rt, &fl))
+ return;
+ }
+
+ /* And let IP do all the hard work.
+
+ This chunk is not reenterable, hence spinlock.
+ Note that it uses the fact, that this function is called
+ with locally disabled BH and that sk cannot be already spinlocked.
+ */
+ bh_lock_sock(sk);
+ inet->tos = skb->nh.iph->tos;
+ sk->sk_priority = skb->priority;
+ sk->sk_protocol = skb->nh.iph->protocol;
+ ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
+ &ipc, rt, MSG_DONTWAIT);
+ if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+ if (arg->csumoffset >= 0)
+ *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
+ skb->ip_summed = CHECKSUM_NONE;
+ ip_push_pending_frames(sk);
+ }
+
+ bh_unlock_sock(sk);
+
+ ip_rt_put(rt);
+}
+
+/*
+ * IP protocol layer initialiser
+ */
+
+static struct packet_type ip_packet_type = {
+ .type = __constant_htons(ETH_P_IP),
+ .func = ip_rcv,
+};
+
+/*
+ * IP registers the packet type and then calls the subprotocol initialisers
+ */
+
+void __init ip_init(void)
+{
+ dev_add_pack(&ip_packet_type);
+
+ ip_rt_init();
+ inet_initpeers();
+
+#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
+ igmp_mc_proc_init();
+#endif
+}
+
+EXPORT_SYMBOL(ip_finish_output);
+EXPORT_SYMBOL(ip_fragment);
+EXPORT_SYMBOL(ip_generic_getfrag);
+EXPORT_SYMBOL(ip_queue_xmit);
+EXPORT_SYMBOL(ip_send_check);
+
+#ifdef CONFIG_SYSCTL
+EXPORT_SYMBOL(sysctl_ip_default_ttl);
+#endif
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
new file mode 100644
index 000000000000..47012b93cad2
--- /dev/null
+++ b/net/ipv4/ip_sockglue.c
@@ -0,0 +1,1093 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The IP to API glue.
+ *
+ * Version: $Id: ip_sockglue.c,v 1.62 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors: see ip.c
+ *
+ * Fixes:
+ * Many : Split from ip.c , see ip.c for history.
+ * Martin Mares : TOS setting fixed.
+ * Alan Cox : Fixed a couple of oopses in Martin's
+ * TOS tweaks.
+ * Mike McLagan : Routing by source
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/netfilter.h>
+#include <linux/route.h>
+#include <linux/mroute.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/transp_v6.h>
+#endif
+
+#include <linux/errqueue.h>
+#include <asm/uaccess.h>
+
+#define IP_CMSG_PKTINFO 1
+#define IP_CMSG_TTL 2
+#define IP_CMSG_TOS 4
+#define IP_CMSG_RECVOPTS 8
+#define IP_CMSG_RETOPTS 16
+
+/*
+ * SOL_IP control messages.
+ */
+
+static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct in_pktinfo info;
+ struct rtable *rt = (struct rtable *)skb->dst;
+
+ info.ipi_addr.s_addr = skb->nh.iph->daddr;
+ if (rt) {
+ info.ipi_ifindex = rt->rt_iif;
+ info.ipi_spec_dst.s_addr = rt->rt_spec_dst;
+ } else {
+ info.ipi_ifindex = 0;
+ info.ipi_spec_dst.s_addr = 0;
+ }
+
+ put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+}
+
+static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb)
+{
+ int ttl = skb->nh.iph->ttl;
+ put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl);
+}
+
+static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb)
+{
+ put_cmsg(msg, SOL_IP, IP_TOS, 1, &skb->nh.iph->tos);
+}
+
+static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
+{
+ if (IPCB(skb)->opt.optlen == 0)
+ return;
+
+ put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, skb->nh.iph+1);
+}
+
+
+static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
+{
+ unsigned char optbuf[sizeof(struct ip_options) + 40];
+ struct ip_options * opt = (struct ip_options*)optbuf;
+
+ if (IPCB(skb)->opt.optlen == 0)
+ return;
+
+ if (ip_options_echo(opt, skb)) {
+ msg->msg_flags |= MSG_CTRUNC;
+ return;
+ }
+ ip_options_undo(opt);
+
+ put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
+}
+
+
+void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct inet_sock *inet = inet_sk(skb->sk);
+ unsigned flags = inet->cmsg_flags;
+
+ /* Ordered by supposed usage frequency */
+ if (flags & 1)
+ ip_cmsg_recv_pktinfo(msg, skb);
+ if ((flags>>=1) == 0)
+ return;
+
+ if (flags & 1)
+ ip_cmsg_recv_ttl(msg, skb);
+ if ((flags>>=1) == 0)
+ return;
+
+ if (flags & 1)
+ ip_cmsg_recv_tos(msg, skb);
+ if ((flags>>=1) == 0)
+ return;
+
+ if (flags & 1)
+ ip_cmsg_recv_opts(msg, skb);
+ if ((flags>>=1) == 0)
+ return;
+
+ if (flags & 1)
+ ip_cmsg_recv_retopts(msg, skb);
+}
+
+int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
+{
+ int err;
+ struct cmsghdr *cmsg;
+
+ for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+ if (cmsg->cmsg_level != SOL_IP)
+ continue;
+ switch (cmsg->cmsg_type) {
+ case IP_RETOPTS:
+ err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+ err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0);
+ if (err)
+ return err;
+ break;
+ case IP_PKTINFO:
+ {
+ struct in_pktinfo *info;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
+ return -EINVAL;
+ info = (struct in_pktinfo *)CMSG_DATA(cmsg);
+ ipc->oif = info->ipi_ifindex;
+ ipc->addr = info->ipi_spec_dst.s_addr;
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+
+/* Special input handler for packets caught by router alert option.
+ They are selected only by protocol field, and then processed likely
+ local ones; but only if someone wants them! Otherwise, router
+ not running rsvpd will kill RSVP.
+
+ It is user level problem, what it will make with them.
+ I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
+ but receiver should be enough clever f.e. to forward mtrace requests,
+ sent to multicast group to reach destination designated router.
+ */
+struct ip_ra_chain *ip_ra_chain;
+DEFINE_RWLOCK(ip_ra_lock);
+
+int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *))
+{
+ struct ip_ra_chain *ra, *new_ra, **rap;
+
+ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num == IPPROTO_RAW)
+ return -EINVAL;
+
+ new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+
+ write_lock_bh(&ip_ra_lock);
+ for (rap = &ip_ra_chain; (ra=*rap) != NULL; rap = &ra->next) {
+ if (ra->sk == sk) {
+ if (on) {
+ write_unlock_bh(&ip_ra_lock);
+ if (new_ra)
+ kfree(new_ra);
+ return -EADDRINUSE;
+ }
+ *rap = ra->next;
+ write_unlock_bh(&ip_ra_lock);
+
+ if (ra->destructor)
+ ra->destructor(sk);
+ sock_put(sk);
+ kfree(ra);
+ return 0;
+ }
+ }
+ if (new_ra == NULL) {
+ write_unlock_bh(&ip_ra_lock);
+ return -ENOBUFS;
+ }
+ new_ra->sk = sk;
+ new_ra->destructor = destructor;
+
+ new_ra->next = ra;
+ *rap = new_ra;
+ sock_hold(sk);
+ write_unlock_bh(&ip_ra_lock);
+
+ return 0;
+}
+
+void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
+ u16 port, u32 info, u8 *payload)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sock_exterr_skb *serr;
+
+ if (!inet->recverr)
+ return;
+
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ serr = SKB_EXT_ERR(skb);
+ serr->ee.ee_errno = err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_ICMP;
+ serr->ee.ee_type = skb->h.icmph->type;
+ serr->ee.ee_code = skb->h.icmph->code;
+ serr->ee.ee_pad = 0;
+ serr->ee.ee_info = info;
+ serr->ee.ee_data = 0;
+ serr->addr_offset = (u8*)&(((struct iphdr*)(skb->h.icmph+1))->daddr) - skb->nh.raw;
+ serr->port = port;
+
+ skb->h.raw = payload;
+ if (!skb_pull(skb, payload - skb->data) ||
+ sock_queue_err_skb(sk, skb))
+ kfree_skb(skb);
+}
+
+void ip_local_error(struct sock *sk, int err, u32 daddr, u16 port, u32 info)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sock_exterr_skb *serr;
+ struct iphdr *iph;
+ struct sk_buff *skb;
+
+ if (!inet->recverr)
+ return;
+
+ skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ iph = (struct iphdr*)skb_put(skb, sizeof(struct iphdr));
+ skb->nh.iph = iph;
+ iph->daddr = daddr;
+
+ serr = SKB_EXT_ERR(skb);
+ serr->ee.ee_errno = err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+ serr->ee.ee_type = 0;
+ serr->ee.ee_code = 0;
+ serr->ee.ee_pad = 0;
+ serr->ee.ee_info = info;
+ serr->ee.ee_data = 0;
+ serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw;
+ serr->port = port;
+
+ skb->h.raw = skb->tail;
+ __skb_pull(skb, skb->tail - skb->data);
+
+ if (sock_queue_err_skb(sk, skb))
+ kfree_skb(skb);
+}
+
+/*
+ * Handle MSG_ERRQUEUE
+ */
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
+{
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb, *skb2;
+ struct sockaddr_in *sin;
+ struct {
+ struct sock_extended_err ee;
+ struct sockaddr_in offender;
+ } errhdr;
+ int err;
+ int copied;
+
+ err = -EAGAIN;
+ skb = skb_dequeue(&sk->sk_error_queue);
+ if (skb == NULL)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+ err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ if (err)
+ goto out_free_skb;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ serr = SKB_EXT_ERR(skb);
+
+ sin = (struct sockaddr_in *)msg->msg_name;
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = *(u32*)(skb->nh.raw + serr->addr_offset);
+ sin->sin_port = serr->port;
+ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ }
+
+ memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
+ sin = &errhdr.offender;
+ sin->sin_family = AF_UNSPEC;
+ if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = skb->nh.iph->saddr;
+ sin->sin_port = 0;
+ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ if (inet->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+ }
+
+ put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr);
+
+ /* Now we could try to dump offended packet options */
+
+ msg->msg_flags |= MSG_ERRQUEUE;
+ err = copied;
+
+ /* Reset and regenerate socket error */
+ spin_lock_irq(&sk->sk_error_queue.lock);
+ sk->sk_err = 0;
+ if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
+ sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
+ spin_unlock_irq(&sk->sk_error_queue.lock);
+ sk->sk_error_report(sk);
+ } else
+ spin_unlock_irq(&sk->sk_error_queue.lock);
+
+out_free_skb:
+ kfree_skb(skb);
+out:
+ return err;
+}
+
+
+/*
+ * Socket option code for IP. This is the end of the line after any TCP,UDP etc options on
+ * an IP socket.
+ */
+
+int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int val=0,err;
+
+ if (level != SOL_IP)
+ return -ENOPROTOOPT;
+
+ if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) |
+ (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) |
+ (1<<IP_RETOPTS) | (1<<IP_TOS) |
+ (1<<IP_TTL) | (1<<IP_HDRINCL) |
+ (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
+ (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND))) ||
+ optname == IP_MULTICAST_TTL ||
+ optname == IP_MULTICAST_LOOP) {
+ if (optlen >= sizeof(int)) {
+ if (get_user(val, (int __user *) optval))
+ return -EFAULT;
+ } else if (optlen >= sizeof(char)) {
+ unsigned char ucval;
+
+ if (get_user(ucval, (unsigned char __user *) optval))
+ return -EFAULT;
+ val = (int) ucval;
+ }
+ }
+
+ /* If optlen==0, it is equivalent to val == 0 */
+
+#ifdef CONFIG_IP_MROUTE
+ if (optname >= MRT_BASE && optname <= (MRT_BASE + 10))
+ return ip_mroute_setsockopt(sk,optname,optval,optlen);
+#endif
+
+ err = 0;
+ lock_sock(sk);
+
+ switch (optname) {
+ case IP_OPTIONS:
+ {
+ struct ip_options * opt = NULL;
+ if (optlen > 40 || optlen < 0)
+ goto e_inval;
+ err = ip_options_get(&opt, optval, optlen, 1);
+ if (err)
+ break;
+ if (sk->sk_type == SOCK_STREAM) {
+ struct tcp_sock *tp = tcp_sk(sk);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ if (sk->sk_family == PF_INET ||
+ (!((1 << sk->sk_state) &
+ (TCPF_LISTEN | TCPF_CLOSE)) &&
+ inet->daddr != LOOPBACK4_IPV6)) {
+#endif
+ if (inet->opt)
+ tp->ext_header_len -= inet->opt->optlen;
+ if (opt)
+ tp->ext_header_len += opt->optlen;
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ }
+#endif
+ }
+ opt = xchg(&inet->opt, opt);
+ if (opt)
+ kfree(opt);
+ break;
+ }
+ case IP_PKTINFO:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_PKTINFO;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_PKTINFO;
+ break;
+ case IP_RECVTTL:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_TTL;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_TTL;
+ break;
+ case IP_RECVTOS:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_TOS;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_TOS;
+ break;
+ case IP_RECVOPTS:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_RECVOPTS;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_RECVOPTS;
+ break;
+ case IP_RETOPTS:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_RETOPTS;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
+ break;
+ case IP_TOS: /* This sets both TOS and Precedence */
+ if (sk->sk_type == SOCK_STREAM) {
+ val &= ~3;
+ val |= inet->tos & 3;
+ }
+ if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP &&
+ !capable(CAP_NET_ADMIN)) {
+ err = -EPERM;
+ break;
+ }
+ if (inet->tos != val) {
+ inet->tos = val;
+ sk->sk_priority = rt_tos2priority(val);
+ sk_dst_reset(sk);
+ }
+ break;
+ case IP_TTL:
+ if (optlen<1)
+ goto e_inval;
+ if (val != -1 && (val < 1 || val>255))
+ goto e_inval;
+ inet->uc_ttl = val;
+ break;
+ case IP_HDRINCL:
+ if (sk->sk_type != SOCK_RAW) {
+ err = -ENOPROTOOPT;
+ break;
+ }
+ inet->hdrincl = val ? 1 : 0;
+ break;
+ case IP_MTU_DISCOVER:
+ if (val<0 || val>2)
+ goto e_inval;
+ inet->pmtudisc = val;
+ break;
+ case IP_RECVERR:
+ inet->recverr = !!val;
+ if (!val)
+ skb_queue_purge(&sk->sk_error_queue);
+ break;
+ case IP_MULTICAST_TTL:
+ if (sk->sk_type == SOCK_STREAM)
+ goto e_inval;
+ if (optlen<1)
+ goto e_inval;
+ if (val==-1)
+ val = 1;
+ if (val < 0 || val > 255)
+ goto e_inval;
+ inet->mc_ttl = val;
+ break;
+ case IP_MULTICAST_LOOP:
+ if (optlen<1)
+ goto e_inval;
+ inet->mc_loop = !!val;
+ break;
+ case IP_MULTICAST_IF:
+ {
+ struct ip_mreqn mreq;
+ struct net_device *dev = NULL;
+
+ if (sk->sk_type == SOCK_STREAM)
+ goto e_inval;
+ /*
+ * Check the arguments are allowable
+ */
+
+ err = -EFAULT;
+ if (optlen >= sizeof(struct ip_mreqn)) {
+ if (copy_from_user(&mreq,optval,sizeof(mreq)))
+ break;
+ } else {
+ memset(&mreq, 0, sizeof(mreq));
+ if (optlen >= sizeof(struct in_addr) &&
+ copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr)))
+ break;
+ }
+
+ if (!mreq.imr_ifindex) {
+ if (mreq.imr_address.s_addr == INADDR_ANY) {
+ inet->mc_index = 0;
+ inet->mc_addr = 0;
+ err = 0;
+ break;
+ }
+ dev = ip_dev_find(mreq.imr_address.s_addr);
+ if (dev) {
+ mreq.imr_ifindex = dev->ifindex;
+ dev_put(dev);
+ }
+ } else
+ dev = __dev_get_by_index(mreq.imr_ifindex);
+
+
+ err = -EADDRNOTAVAIL;
+ if (!dev)
+ break;
+
+ err = -EINVAL;
+ if (sk->sk_bound_dev_if &&
+ mreq.imr_ifindex != sk->sk_bound_dev_if)
+ break;
+
+ inet->mc_index = mreq.imr_ifindex;
+ inet->mc_addr = mreq.imr_address.s_addr;
+ err = 0;
+ break;
+ }
+
+ case IP_ADD_MEMBERSHIP:
+ case IP_DROP_MEMBERSHIP:
+ {
+ struct ip_mreqn mreq;
+
+ if (optlen < sizeof(struct ip_mreq))
+ goto e_inval;
+ err = -EFAULT;
+ if (optlen >= sizeof(struct ip_mreqn)) {
+ if(copy_from_user(&mreq,optval,sizeof(mreq)))
+ break;
+ } else {
+ memset(&mreq, 0, sizeof(mreq));
+ if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq)))
+ break;
+ }
+
+ if (optname == IP_ADD_MEMBERSHIP)
+ err = ip_mc_join_group(sk, &mreq);
+ else
+ err = ip_mc_leave_group(sk, &mreq);
+ break;
+ }
+ case IP_MSFILTER:
+ {
+ extern int sysctl_optmem_max;
+ extern int sysctl_igmp_max_msf;
+ struct ip_msfilter *msf;
+
+ if (optlen < IP_MSFILTER_SIZE(0))
+ goto e_inval;
+ if (optlen > sysctl_optmem_max) {
+ err = -ENOBUFS;
+ break;
+ }
+ msf = (struct ip_msfilter *)kmalloc(optlen, GFP_KERNEL);
+ if (msf == 0) {
+ err = -ENOBUFS;
+ break;
+ }
+ err = -EFAULT;
+ if (copy_from_user(msf, optval, optlen)) {
+ kfree(msf);
+ break;
+ }
+ /* numsrc >= (1G-4) overflow in 32 bits */
+ if (msf->imsf_numsrc >= 0x3ffffffcU ||
+ msf->imsf_numsrc > sysctl_igmp_max_msf) {
+ kfree(msf);
+ err = -ENOBUFS;
+ break;
+ }
+ if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
+ kfree(msf);
+ err = -EINVAL;
+ break;
+ }
+ err = ip_mc_msfilter(sk, msf, 0);
+ kfree(msf);
+ break;
+ }
+ case IP_BLOCK_SOURCE:
+ case IP_UNBLOCK_SOURCE:
+ case IP_ADD_SOURCE_MEMBERSHIP:
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ {
+ struct ip_mreq_source mreqs;
+ int omode, add;
+
+ if (optlen != sizeof(struct ip_mreq_source))
+ goto e_inval;
+ if (copy_from_user(&mreqs, optval, sizeof(mreqs))) {
+ err = -EFAULT;
+ break;
+ }
+ if (optname == IP_BLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 1;
+ } else if (optname == IP_UNBLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 0;
+ } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) {
+ struct ip_mreqn mreq;
+
+ mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
+ mreq.imr_address.s_addr = mreqs.imr_interface;
+ mreq.imr_ifindex = 0;
+ err = ip_mc_join_group(sk, &mreq);
+ if (err)
+ break;
+ omode = MCAST_INCLUDE;
+ add = 1;
+ } else /*IP_DROP_SOURCE_MEMBERSHIP */ {
+ omode = MCAST_INCLUDE;
+ add = 0;
+ }
+ err = ip_mc_source(add, omode, sk, &mreqs, 0);
+ break;
+ }
+ case MCAST_JOIN_GROUP:
+ case MCAST_LEAVE_GROUP:
+ {
+ struct group_req greq;
+ struct sockaddr_in *psin;
+ struct ip_mreqn mreq;
+
+ if (optlen < sizeof(struct group_req))
+ goto e_inval;
+ err = -EFAULT;
+ if(copy_from_user(&greq, optval, sizeof(greq)))
+ break;
+ psin = (struct sockaddr_in *)&greq.gr_group;
+ if (psin->sin_family != AF_INET)
+ goto e_inval;
+ memset(&mreq, 0, sizeof(mreq));
+ mreq.imr_multiaddr = psin->sin_addr;
+ mreq.imr_ifindex = greq.gr_interface;
+
+ if (optname == MCAST_JOIN_GROUP)
+ err = ip_mc_join_group(sk, &mreq);
+ else
+ err = ip_mc_leave_group(sk, &mreq);
+ break;
+ }
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_UNBLOCK_SOURCE:
+ {
+ struct group_source_req greqs;
+ struct ip_mreq_source mreqs;
+ struct sockaddr_in *psin;
+ int omode, add;
+
+ if (optlen != sizeof(struct group_source_req))
+ goto e_inval;
+ if (copy_from_user(&greqs, optval, sizeof(greqs))) {
+ err = -EFAULT;
+ break;
+ }
+ if (greqs.gsr_group.ss_family != AF_INET ||
+ greqs.gsr_source.ss_family != AF_INET) {
+ err = -EADDRNOTAVAIL;
+ break;
+ }
+ psin = (struct sockaddr_in *)&greqs.gsr_group;
+ mreqs.imr_multiaddr = psin->sin_addr.s_addr;
+ psin = (struct sockaddr_in *)&greqs.gsr_source;
+ mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
+ mreqs.imr_interface = 0; /* use index for mc_source */
+
+ if (optname == MCAST_BLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 1;
+ } else if (optname == MCAST_UNBLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 0;
+ } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
+ struct ip_mreqn mreq;
+
+ psin = (struct sockaddr_in *)&greqs.gsr_group;
+ mreq.imr_multiaddr = psin->sin_addr;
+ mreq.imr_address.s_addr = 0;
+ mreq.imr_ifindex = greqs.gsr_interface;
+ err = ip_mc_join_group(sk, &mreq);
+ if (err)
+ break;
+ greqs.gsr_interface = mreq.imr_ifindex;
+ omode = MCAST_INCLUDE;
+ add = 1;
+ } else /* MCAST_LEAVE_SOURCE_GROUP */ {
+ omode = MCAST_INCLUDE;
+ add = 0;
+ }
+ err = ip_mc_source(add, omode, sk, &mreqs,
+ greqs.gsr_interface);
+ break;
+ }
+ case MCAST_MSFILTER:
+ {
+ extern int sysctl_optmem_max;
+ extern int sysctl_igmp_max_msf;
+ struct sockaddr_in *psin;
+ struct ip_msfilter *msf = NULL;
+ struct group_filter *gsf = NULL;
+ int msize, i, ifindex;
+
+ if (optlen < GROUP_FILTER_SIZE(0))
+ goto e_inval;
+ if (optlen > sysctl_optmem_max) {
+ err = -ENOBUFS;
+ break;
+ }
+ gsf = (struct group_filter *)kmalloc(optlen,GFP_KERNEL);
+ if (gsf == 0) {
+ err = -ENOBUFS;
+ break;
+ }
+ err = -EFAULT;
+ if (copy_from_user(gsf, optval, optlen)) {
+ goto mc_msf_out;
+ }
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ if (gsf->gf_numsrc >= 0x1ffffff ||
+ gsf->gf_numsrc > sysctl_igmp_max_msf) {
+ err = -ENOBUFS;
+ goto mc_msf_out;
+ }
+ if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
+ err = -EINVAL;
+ goto mc_msf_out;
+ }
+ msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
+ msf = (struct ip_msfilter *)kmalloc(msize,GFP_KERNEL);
+ if (msf == 0) {
+ err = -ENOBUFS;
+ goto mc_msf_out;
+ }
+ ifindex = gsf->gf_interface;
+ psin = (struct sockaddr_in *)&gsf->gf_group;
+ if (psin->sin_family != AF_INET) {
+ err = -EADDRNOTAVAIL;
+ goto mc_msf_out;
+ }
+ msf->imsf_multiaddr = psin->sin_addr.s_addr;
+ msf->imsf_interface = 0;
+ msf->imsf_fmode = gsf->gf_fmode;
+ msf->imsf_numsrc = gsf->gf_numsrc;
+ err = -EADDRNOTAVAIL;
+ for (i=0; i<gsf->gf_numsrc; ++i) {
+ psin = (struct sockaddr_in *)&gsf->gf_slist[i];
+
+ if (psin->sin_family != AF_INET)
+ goto mc_msf_out;
+ msf->imsf_slist[i] = psin->sin_addr.s_addr;
+ }
+ kfree(gsf);
+ gsf = NULL;
+
+ err = ip_mc_msfilter(sk, msf, ifindex);
+mc_msf_out:
+ if (msf)
+ kfree(msf);
+ if (gsf)
+ kfree(gsf);
+ break;
+ }
+ case IP_ROUTER_ALERT:
+ err = ip_ra_control(sk, val ? 1 : 0, NULL);
+ break;
+
+ case IP_FREEBIND:
+ if (optlen<1)
+ goto e_inval;
+ inet->freebind = !!val;
+ break;
+
+ case IP_IPSEC_POLICY:
+ case IP_XFRM_POLICY:
+ err = xfrm_user_policy(sk, optname, optval, optlen);
+ break;
+
+ default:
+#ifdef CONFIG_NETFILTER
+ err = nf_setsockopt(sk, PF_INET, optname, optval,
+ optlen);
+#else
+ err = -ENOPROTOOPT;
+#endif
+ break;
+ }
+ release_sock(sk);
+ return err;
+
+e_inval:
+ release_sock(sk);
+ return -EINVAL;
+}
+
+/*
+ * Get the options. Note for future reference. The GET of IP options gets the
+ * _received_ ones. The set sets the _sent_ ones.
+ */
+
+int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int val;
+ int len;
+
+ if(level!=SOL_IP)
+ return -EOPNOTSUPP;
+
+#ifdef CONFIG_IP_MROUTE
+ if(optname>=MRT_BASE && optname <=MRT_BASE+10)
+ {
+ return ip_mroute_getsockopt(sk,optname,optval,optlen);
+ }
+#endif
+
+ if(get_user(len,optlen))
+ return -EFAULT;
+ if(len < 0)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ switch(optname) {
+ case IP_OPTIONS:
+ {
+ unsigned char optbuf[sizeof(struct ip_options)+40];
+ struct ip_options * opt = (struct ip_options*)optbuf;
+ opt->optlen = 0;
+ if (inet->opt)
+ memcpy(optbuf, inet->opt,
+ sizeof(struct ip_options)+
+ inet->opt->optlen);
+ release_sock(sk);
+
+ if (opt->optlen == 0)
+ return put_user(0, optlen);
+
+ ip_options_undo(opt);
+
+ len = min_t(unsigned int, len, opt->optlen);
+ if(put_user(len, optlen))
+ return -EFAULT;
+ if(copy_to_user(optval, opt->__data, len))
+ return -EFAULT;
+ return 0;
+ }
+ case IP_PKTINFO:
+ val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0;
+ break;
+ case IP_RECVTTL:
+ val = (inet->cmsg_flags & IP_CMSG_TTL) != 0;
+ break;
+ case IP_RECVTOS:
+ val = (inet->cmsg_flags & IP_CMSG_TOS) != 0;
+ break;
+ case IP_RECVOPTS:
+ val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0;
+ break;
+ case IP_RETOPTS:
+ val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
+ break;
+ case IP_TOS:
+ val = inet->tos;
+ break;
+ case IP_TTL:
+ val = (inet->uc_ttl == -1 ?
+ sysctl_ip_default_ttl :
+ inet->uc_ttl);
+ break;
+ case IP_HDRINCL:
+ val = inet->hdrincl;
+ break;
+ case IP_MTU_DISCOVER:
+ val = inet->pmtudisc;
+ break;
+ case IP_MTU:
+ {
+ struct dst_entry *dst;
+ val = 0;
+ dst = sk_dst_get(sk);
+ if (dst) {
+ val = dst_mtu(dst);
+ dst_release(dst);
+ }
+ if (!val) {
+ release_sock(sk);
+ return -ENOTCONN;
+ }
+ break;
+ }
+ case IP_RECVERR:
+ val = inet->recverr;
+ break;
+ case IP_MULTICAST_TTL:
+ val = inet->mc_ttl;
+ break;
+ case IP_MULTICAST_LOOP:
+ val = inet->mc_loop;
+ break;
+ case IP_MULTICAST_IF:
+ {
+ struct in_addr addr;
+ len = min_t(unsigned int, len, sizeof(struct in_addr));
+ addr.s_addr = inet->mc_addr;
+ release_sock(sk);
+
+ if(put_user(len, optlen))
+ return -EFAULT;
+ if(copy_to_user(optval, &addr, len))
+ return -EFAULT;
+ return 0;
+ }
+ case IP_MSFILTER:
+ {
+ struct ip_msfilter msf;
+ int err;
+
+ if (len < IP_MSFILTER_SIZE(0)) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+ if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ err = ip_mc_msfget(sk, &msf,
+ (struct ip_msfilter __user *)optval, optlen);
+ release_sock(sk);
+ return err;
+ }
+ case MCAST_MSFILTER:
+ {
+ struct group_filter gsf;
+ int err;
+
+ if (len < GROUP_FILTER_SIZE(0)) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+ if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ err = ip_mc_gsfget(sk, &gsf,
+ (struct group_filter __user *)optval, optlen);
+ release_sock(sk);
+ return err;
+ }
+ case IP_PKTOPTIONS:
+ {
+ struct msghdr msg;
+
+ release_sock(sk);
+
+ if (sk->sk_type != SOCK_STREAM)
+ return -ENOPROTOOPT;
+
+ msg.msg_control = optval;
+ msg.msg_controllen = len;
+ msg.msg_flags = 0;
+
+ if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
+ struct in_pktinfo info;
+
+ info.ipi_addr.s_addr = inet->rcv_saddr;
+ info.ipi_spec_dst.s_addr = inet->rcv_saddr;
+ info.ipi_ifindex = inet->mc_index;
+ put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+ }
+ if (inet->cmsg_flags & IP_CMSG_TTL) {
+ int hlim = inet->mc_ttl;
+ put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
+ }
+ len -= msg.msg_controllen;
+ return put_user(len, optlen);
+ }
+ case IP_FREEBIND:
+ val = inet->freebind;
+ break;
+ default:
+#ifdef CONFIG_NETFILTER
+ val = nf_getsockopt(sk, PF_INET, optname, optval,
+ &len);
+ release_sock(sk);
+ if (val >= 0)
+ val = put_user(len, optlen);
+ return val;
+#else
+ release_sock(sk);
+ return -ENOPROTOOPT;
+#endif
+ }
+ release_sock(sk);
+
+ if (len < sizeof(int) && len > 0 && val>=0 && val<255) {
+ unsigned char ucval = (unsigned char)val;
+ len = 1;
+ if(put_user(len, optlen))
+ return -EFAULT;
+ if(copy_to_user(optval,&ucval,1))
+ return -EFAULT;
+ } else {
+ len = min_t(unsigned int, sizeof(int), len);
+ if(put_user(len, optlen))
+ return -EFAULT;
+ if(copy_to_user(optval,&val,len))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+EXPORT_SYMBOL(ip_cmsg_recv);
+
+#ifdef CONFIG_IP_SCTP_MODULE
+EXPORT_SYMBOL(ip_getsockopt);
+EXPORT_SYMBOL(ip_setsockopt);
+#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
new file mode 100644
index 000000000000..1a23c5263b99
--- /dev/null
+++ b/net/ipv4/ipcomp.c
@@ -0,0 +1,524 @@
+/*
+ * IP Payload Compression Protocol (IPComp) - RFC3173.
+ *
+ * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Todo:
+ * - Tunable compression parameters.
+ * - Compression stats.
+ * - Adaptive compression.
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/scatterlist.h>
+#include <asm/semaphore.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <linux/rtnetlink.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/icmp.h>
+#include <net/ipcomp.h>
+
+struct ipcomp_tfms {
+ struct list_head list;
+ struct crypto_tfm **tfms;
+ int users;
+};
+
+static DECLARE_MUTEX(ipcomp_resource_sem);
+static void **ipcomp_scratches;
+static int ipcomp_scratch_users;
+static LIST_HEAD(ipcomp_tfms_list);
+
+static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err, plen, dlen;
+ struct iphdr *iph;
+ struct ipcomp_data *ipcd = x->data;
+ u8 *start, *scratch;
+ struct crypto_tfm *tfm;
+ int cpu;
+
+ plen = skb->len;
+ dlen = IPCOMP_SCRATCH_SIZE;
+ start = skb->data;
+
+ cpu = get_cpu();
+ scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
+ tfm = *per_cpu_ptr(ipcd->tfms, cpu);
+
+ err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen);
+ if (err)
+ goto out;
+
+ if (dlen < (plen + sizeof(struct ip_comp_hdr))) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC);
+ if (err)
+ goto out;
+
+ skb_put(skb, dlen - plen);
+ memcpy(skb->data, scratch, dlen);
+ iph = skb->nh.iph;
+ iph->tot_len = htons(dlen + iph->ihl * 4);
+out:
+ put_cpu();
+ return err;
+}
+
+static int ipcomp_input(struct xfrm_state *x,
+ struct xfrm_decap_state *decap, struct sk_buff *skb)
+{
+ u8 nexthdr;
+ int err = 0;
+ struct iphdr *iph;
+ union {
+ struct iphdr iph;
+ char buf[60];
+ } tmp_iph;
+
+
+ if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
+ skb_linearize(skb, GFP_ATOMIC) != 0) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ /* Remove ipcomp header and decompress original payload */
+ iph = skb->nh.iph;
+ memcpy(&tmp_iph, iph, iph->ihl * 4);
+ nexthdr = *(u8 *)skb->data;
+ skb_pull(skb, sizeof(struct ip_comp_hdr));
+ skb->nh.raw += sizeof(struct ip_comp_hdr);
+ memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4);
+ iph = skb->nh.iph;
+ iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr));
+ iph->protocol = nexthdr;
+ skb->h.raw = skb->data;
+ err = ipcomp_decompress(x, skb);
+
+out:
+ return err;
+}
+
+static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err, plen, dlen, ihlen;
+ struct iphdr *iph = skb->nh.iph;
+ struct ipcomp_data *ipcd = x->data;
+ u8 *start, *scratch;
+ struct crypto_tfm *tfm;
+ int cpu;
+
+ ihlen = iph->ihl * 4;
+ plen = skb->len - ihlen;
+ dlen = IPCOMP_SCRATCH_SIZE;
+ start = skb->data + ihlen;
+
+ cpu = get_cpu();
+ scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
+ tfm = *per_cpu_ptr(ipcd->tfms, cpu);
+
+ err = crypto_comp_compress(tfm, start, plen, scratch, &dlen);
+ if (err)
+ goto out;
+
+ if ((dlen + sizeof(struct ip_comp_hdr)) >= plen) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen);
+ put_cpu();
+
+ pskb_trim(skb, ihlen + dlen + sizeof(struct ip_comp_hdr));
+ return 0;
+
+out:
+ put_cpu();
+ return err;
+}
+
+static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+ struct iphdr *iph;
+ struct ip_comp_hdr *ipch;
+ struct ipcomp_data *ipcd = x->data;
+ int hdr_len = 0;
+
+ iph = skb->nh.iph;
+ iph->tot_len = htons(skb->len);
+ hdr_len = iph->ihl * 4;
+ if ((skb->len - hdr_len) < ipcd->threshold) {
+ /* Don't bother compressing */
+ goto out_ok;
+ }
+
+ if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
+ skb_linearize(skb, GFP_ATOMIC) != 0) {
+ goto out_ok;
+ }
+
+ err = ipcomp_compress(x, skb);
+ iph = skb->nh.iph;
+
+ if (err) {
+ goto out_ok;
+ }
+
+ /* Install ipcomp header, convert into ipcomp datagram. */
+ iph->tot_len = htons(skb->len);
+ ipch = (struct ip_comp_hdr *)((char *)iph + iph->ihl * 4);
+ ipch->nexthdr = iph->protocol;
+ ipch->flags = 0;
+ ipch->cpi = htons((u16 )ntohl(x->id.spi));
+ iph->protocol = IPPROTO_COMP;
+ ip_send_check(iph);
+ return 0;
+
+out_ok:
+ if (x->props.mode)
+ ip_send_check(iph);
+ return 0;
+}
+
+static void ipcomp4_err(struct sk_buff *skb, u32 info)
+{
+ u32 spi;
+ struct iphdr *iph = (struct iphdr *)skb->data;
+ struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+ struct xfrm_state *x;
+
+ if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
+ skb->h.icmph->code != ICMP_FRAG_NEEDED)
+ return;
+
+ spi = ntohl(ntohs(ipch->cpi));
+ x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr,
+ spi, IPPROTO_COMP, AF_INET);
+ if (!x)
+ return;
+ NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
+ spi, NIPQUAD(iph->daddr)));
+ xfrm_state_put(x);
+}
+
+/* We always hold one tunnel user reference to indicate a tunnel */
+static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
+{
+ struct xfrm_state *t;
+
+ t = xfrm_state_alloc();
+ if (t == NULL)
+ goto out;
+
+ t->id.proto = IPPROTO_IPIP;
+ t->id.spi = x->props.saddr.a4;
+ t->id.daddr.a4 = x->id.daddr.a4;
+ memcpy(&t->sel, &x->sel, sizeof(t->sel));
+ t->props.family = AF_INET;
+ t->props.mode = 1;
+ t->props.saddr.a4 = x->props.saddr.a4;
+ t->props.flags = x->props.flags;
+
+ t->type = xfrm_get_type(IPPROTO_IPIP, t->props.family);
+ if (t->type == NULL)
+ goto error;
+
+ if (t->type->init_state(t, NULL))
+ goto error;
+
+ t->km.state = XFRM_STATE_VALID;
+ atomic_set(&t->tunnel_users, 1);
+out:
+ return t;
+
+error:
+ t->km.state = XFRM_STATE_DEAD;
+ xfrm_state_put(t);
+ t = NULL;
+ goto out;
+}
+
+/*
+ * Must be protected by xfrm_cfg_sem. State and tunnel user references are
+ * always incremented on success.
+ */
+static int ipcomp_tunnel_attach(struct xfrm_state *x)
+{
+ int err = 0;
+ struct xfrm_state *t;
+
+ t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr.a4,
+ x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
+ if (!t) {
+ t = ipcomp_tunnel_create(x);
+ if (!t) {
+ err = -EINVAL;
+ goto out;
+ }
+ xfrm_state_insert(t);
+ xfrm_state_hold(t);
+ }
+ x->tunnel = t;
+ atomic_inc(&t->tunnel_users);
+out:
+ return err;
+}
+
+static void ipcomp_free_scratches(void)
+{
+ int i;
+ void **scratches;
+
+ if (--ipcomp_scratch_users)
+ return;
+
+ scratches = ipcomp_scratches;
+ if (!scratches)
+ return;
+
+ for_each_cpu(i) {
+ void *scratch = *per_cpu_ptr(scratches, i);
+ if (scratch)
+ vfree(scratch);
+ }
+
+ free_percpu(scratches);
+}
+
+static void **ipcomp_alloc_scratches(void)
+{
+ int i;
+ void **scratches;
+
+ if (ipcomp_scratch_users++)
+ return ipcomp_scratches;
+
+ scratches = alloc_percpu(void *);
+ if (!scratches)
+ return NULL;
+
+ ipcomp_scratches = scratches;
+
+ for_each_cpu(i) {
+ void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE);
+ if (!scratch)
+ return NULL;
+ *per_cpu_ptr(scratches, i) = scratch;
+ }
+
+ return scratches;
+}
+
+static void ipcomp_free_tfms(struct crypto_tfm **tfms)
+{
+ struct ipcomp_tfms *pos;
+ int cpu;
+
+ list_for_each_entry(pos, &ipcomp_tfms_list, list) {
+ if (pos->tfms == tfms)
+ break;
+ }
+
+ BUG_TRAP(pos);
+
+ if (--pos->users)
+ return;
+
+ list_del(&pos->list);
+ kfree(pos);
+
+ if (!tfms)
+ return;
+
+ for_each_cpu(cpu) {
+ struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu);
+ if (tfm)
+ crypto_free_tfm(tfm);
+ }
+ free_percpu(tfms);
+}
+
+static struct crypto_tfm **ipcomp_alloc_tfms(const char *alg_name)
+{
+ struct ipcomp_tfms *pos;
+ struct crypto_tfm **tfms;
+ int cpu;
+
+ /* This can be any valid CPU ID so we don't need locking. */
+ cpu = smp_processor_id();
+
+ list_for_each_entry(pos, &ipcomp_tfms_list, list) {
+ struct crypto_tfm *tfm;
+
+ tfms = pos->tfms;
+ tfm = *per_cpu_ptr(tfms, cpu);
+
+ if (!strcmp(crypto_tfm_alg_name(tfm), alg_name)) {
+ pos->users++;
+ return tfms;
+ }
+ }
+
+ pos = kmalloc(sizeof(*pos), GFP_KERNEL);
+ if (!pos)
+ return NULL;
+
+ pos->users = 1;
+ INIT_LIST_HEAD(&pos->list);
+ list_add(&pos->list, &ipcomp_tfms_list);
+
+ pos->tfms = tfms = alloc_percpu(struct crypto_tfm *);
+ if (!tfms)
+ goto error;
+
+ for_each_cpu(cpu) {
+ struct crypto_tfm *tfm = crypto_alloc_tfm(alg_name, 0);
+ if (!tfm)
+ goto error;
+ *per_cpu_ptr(tfms, cpu) = tfm;
+ }
+
+ return tfms;
+
+error:
+ ipcomp_free_tfms(tfms);
+ return NULL;
+}
+
+static void ipcomp_free_data(struct ipcomp_data *ipcd)
+{
+ if (ipcd->tfms)
+ ipcomp_free_tfms(ipcd->tfms);
+ ipcomp_free_scratches();
+}
+
+static void ipcomp_destroy(struct xfrm_state *x)
+{
+ struct ipcomp_data *ipcd = x->data;
+ if (!ipcd)
+ return;
+ xfrm_state_delete_tunnel(x);
+ down(&ipcomp_resource_sem);
+ ipcomp_free_data(ipcd);
+ up(&ipcomp_resource_sem);
+ kfree(ipcd);
+}
+
+static int ipcomp_init_state(struct xfrm_state *x, void *args)
+{
+ int err;
+ struct ipcomp_data *ipcd;
+ struct xfrm_algo_desc *calg_desc;
+
+ err = -EINVAL;
+ if (!x->calg)
+ goto out;
+
+ if (x->encap)
+ goto out;
+
+ err = -ENOMEM;
+ ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL);
+ if (!ipcd)
+ goto out;
+
+ memset(ipcd, 0, sizeof(*ipcd));
+ x->props.header_len = 0;
+ if (x->props.mode)
+ x->props.header_len += sizeof(struct iphdr);
+
+ down(&ipcomp_resource_sem);
+ if (!ipcomp_alloc_scratches())
+ goto error;
+
+ ipcd->tfms = ipcomp_alloc_tfms(x->calg->alg_name);
+ if (!ipcd->tfms)
+ goto error;
+ up(&ipcomp_resource_sem);
+
+ if (x->props.mode) {
+ err = ipcomp_tunnel_attach(x);
+ if (err)
+ goto error_tunnel;
+ }
+
+ calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0);
+ BUG_ON(!calg_desc);
+ ipcd->threshold = calg_desc->uinfo.comp.threshold;
+ x->data = ipcd;
+ err = 0;
+out:
+ return err;
+
+error_tunnel:
+ down(&ipcomp_resource_sem);
+error:
+ ipcomp_free_data(ipcd);
+ up(&ipcomp_resource_sem);
+ kfree(ipcd);
+ goto out;
+}
+
+static struct xfrm_type ipcomp_type = {
+ .description = "IPCOMP4",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_COMP,
+ .init_state = ipcomp_init_state,
+ .destructor = ipcomp_destroy,
+ .input = ipcomp_input,
+ .output = ipcomp_output
+};
+
+static struct net_protocol ipcomp4_protocol = {
+ .handler = xfrm4_rcv,
+ .err_handler = ipcomp4_err,
+ .no_policy = 1,
+};
+
+static int __init ipcomp4_init(void)
+{
+ if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
+ printk(KERN_INFO "ipcomp init: can't add xfrm type\n");
+ return -EAGAIN;
+ }
+ if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
+ printk(KERN_INFO "ipcomp init: can't add protocol\n");
+ xfrm_unregister_type(&ipcomp_type, AF_INET);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit ipcomp4_fini(void)
+{
+ if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
+ printk(KERN_INFO "ip ipcomp close: can't remove protocol\n");
+ if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
+ printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n");
+}
+
+module_init(ipcomp4_init);
+module_exit(ipcomp4_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
new file mode 100644
index 000000000000..f2509034ce72
--- /dev/null
+++ b/net/ipv4/ipconfig.c
@@ -0,0 +1,1507 @@
+/*
+ * $Id: ipconfig.c,v 1.46 2002/02/01 22:01:04 davem Exp $
+ *
+ * Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or
+ * user-supplied information to configure own IP address and routes.
+ *
+ * Copyright (C) 1996-1998 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * Derived from network configuration code in fs/nfs/nfsroot.c,
+ * originally Copyright (C) 1995, 1996 Gero Kuhlmann and me.
+ *
+ * BOOTP rewritten to construct and analyse packets itself instead
+ * of misusing the IP layer. num_bugs_causing_wrong_arp_replies--;
+ * -- MJ, December 1998
+ *
+ * Fixed ip_auto_config_setup calling at startup in the new "Linker Magic"
+ * initialization scheme.
+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 08/11/1999
+ *
+ * DHCP support added. To users this looks like a whole separate
+ * protocol, but we know it's just a bag on the side of BOOTP.
+ * -- Chip Salzenberg <chip@valinux.com>, May 2000
+ *
+ * Ported DHCP support from 2.2.16 to 2.4.0-test4
+ * -- Eric Biederman <ebiederman@lnxi.com>, 30 Aug 2000
+ *
+ * Merged changes from 2.2.19 into 2.4.3
+ * -- Eric Biederman <ebiederman@lnxi.com>, 22 April Aug 2001
+ *
+ * Multiple Nameservers in /proc/net/pnp
+ * -- Josef Siemes <jsiemes@web.de>, Aug 2002
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/socket.h>
+#include <linux/route.h>
+#include <linux/udp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/major.h>
+#include <linux/root_dev.h>
+#include <linux/delay.h>
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/ipconfig.h>
+
+#include <asm/uaccess.h>
+#include <net/checksum.h>
+#include <asm/processor.h>
+
+/* Define this to allow debugging output */
+#undef IPCONFIG_DEBUG
+
+#ifdef IPCONFIG_DEBUG
+#define DBG(x) printk x
+#else
+#define DBG(x) do { } while(0)
+#endif
+
+#if defined(CONFIG_IP_PNP_DHCP)
+#define IPCONFIG_DHCP
+#endif
+#if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_DHCP)
+#define IPCONFIG_BOOTP
+#endif
+#if defined(CONFIG_IP_PNP_RARP)
+#define IPCONFIG_RARP
+#endif
+#if defined(IPCONFIG_BOOTP) || defined(IPCONFIG_RARP)
+#define IPCONFIG_DYNAMIC
+#endif
+
+/* Define the friendly delay before and after opening net devices */
+#define CONF_PRE_OPEN 500 /* Before opening: 1/2 second */
+#define CONF_POST_OPEN 1 /* After opening: 1 second */
+
+/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
+#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
+#define CONF_SEND_RETRIES 6 /* Send six requests per open */
+#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */
+#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */
+#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */
+#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */
+#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */
+#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers
+ - '3' from resolv.h */
+
+
+/*
+ * Public IP configuration
+ */
+
+/* This is used by platforms which might be able to set the ipconfig
+ * variables using firmware environment vars. If this is set, it will
+ * ignore such firmware variables.
+ */
+int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */
+
+static int ic_enable __initdata = 0; /* IP config enabled? */
+
+/* Protocol choice */
+int ic_proto_enabled __initdata = 0
+#ifdef IPCONFIG_BOOTP
+ | IC_BOOTP
+#endif
+#ifdef CONFIG_IP_PNP_DHCP
+ | IC_USE_DHCP
+#endif
+#ifdef IPCONFIG_RARP
+ | IC_RARP
+#endif
+ ;
+
+static int ic_host_name_set __initdata = 0; /* Host name set by us? */
+
+u32 ic_myaddr = INADDR_NONE; /* My IP address */
+static u32 ic_netmask = INADDR_NONE; /* Netmask for local subnet */
+u32 ic_gateway = INADDR_NONE; /* Gateway IP address */
+
+u32 ic_servaddr = INADDR_NONE; /* Boot server IP address */
+
+u32 root_server_addr = INADDR_NONE; /* Address of NFS server */
+u8 root_server_path[256] = { 0, }; /* Path to mount as root */
+
+/* Persistent data: */
+
+static int ic_proto_used; /* Protocol used, if any */
+static u32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
+static u8 ic_domain[64]; /* DNS (not NIS) domain name */
+
+/*
+ * Private state.
+ */
+
+/* Name of user-selected boot device */
+static char user_dev_name[IFNAMSIZ] __initdata = { 0, };
+
+/* Protocols supported by available interfaces */
+static int ic_proto_have_if __initdata = 0;
+
+#ifdef IPCONFIG_DYNAMIC
+static DEFINE_SPINLOCK(ic_recv_lock);
+static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */
+#endif
+#ifdef IPCONFIG_DHCP
+static int ic_dhcp_msgtype __initdata = 0; /* DHCP msg type received */
+#endif
+
+
+/*
+ * Network devices
+ */
+
+struct ic_device {
+ struct ic_device *next;
+ struct net_device *dev;
+ unsigned short flags;
+ short able;
+ u32 xid;
+};
+
+static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */
+static struct net_device *ic_dev __initdata = NULL; /* Selected device */
+
+static int __init ic_open_devs(void)
+{
+ struct ic_device *d, **last;
+ struct net_device *dev;
+ unsigned short oflags;
+
+ last = &ic_first_dev;
+ rtnl_shlock();
+
+ /* bring loopback device up first */
+ if (dev_change_flags(&loopback_dev, loopback_dev.flags | IFF_UP) < 0)
+ printk(KERN_ERR "IP-Config: Failed to open %s\n", loopback_dev.name);
+
+ for (dev = dev_base; dev; dev = dev->next) {
+ if (dev == &loopback_dev)
+ continue;
+ if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
+ (!(dev->flags & IFF_LOOPBACK) &&
+ (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
+ strncmp(dev->name, "dummy", 5))) {
+ int able = 0;
+ if (dev->mtu >= 364)
+ able |= IC_BOOTP;
+ else
+ printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small", dev->name, dev->mtu);
+ if (!(dev->flags & IFF_NOARP))
+ able |= IC_RARP;
+ able &= ic_proto_enabled;
+ if (ic_proto_enabled && !able)
+ continue;
+ oflags = dev->flags;
+ if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
+ printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
+ continue;
+ }
+ if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
+ rtnl_shunlock();
+ return -1;
+ }
+ d->dev = dev;
+ *last = d;
+ last = &d->next;
+ d->flags = oflags;
+ d->able = able;
+ if (able & IC_BOOTP)
+ get_random_bytes(&d->xid, sizeof(u32));
+ else
+ d->xid = 0;
+ ic_proto_have_if |= able;
+ DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n",
+ dev->name, able, d->xid));
+ }
+ }
+ rtnl_shunlock();
+
+ *last = NULL;
+
+ if (!ic_first_dev) {
+ if (user_dev_name[0])
+ printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
+ else
+ printk(KERN_ERR "IP-Config: No network devices available.\n");
+ return -1;
+ }
+ return 0;
+}
+
+static void __init ic_close_devs(void)
+{
+ struct ic_device *d, *next;
+ struct net_device *dev;
+
+ rtnl_shlock();
+ next = ic_first_dev;
+ while ((d = next)) {
+ next = d->next;
+ dev = d->dev;
+ if (dev != ic_dev) {
+ DBG(("IP-Config: Downing %s\n", dev->name));
+ dev_change_flags(dev, d->flags);
+ }
+ kfree(d);
+ }
+ rtnl_shunlock();
+}
+
+/*
+ * Interface to various network functions.
+ */
+
+static inline void
+set_sockaddr(struct sockaddr_in *sin, u32 addr, u16 port)
+{
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = addr;
+ sin->sin_port = port;
+}
+
+static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
+{
+ int res;
+
+ mm_segment_t oldfs = get_fs();
+ set_fs(get_ds());
+ res = devinet_ioctl(cmd, (struct ifreq __user *) arg);
+ set_fs(oldfs);
+ return res;
+}
+
+static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg)
+{
+ int res;
+
+ mm_segment_t oldfs = get_fs();
+ set_fs(get_ds());
+ res = ip_rt_ioctl(cmd, (void __user *) arg);
+ set_fs(oldfs);
+ return res;
+}
+
+/*
+ * Set up interface addresses and routes.
+ */
+
+static int __init ic_setup_if(void)
+{
+ struct ifreq ir;
+ struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr;
+ int err;
+
+ memset(&ir, 0, sizeof(ir));
+ strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
+ set_sockaddr(sin, ic_myaddr, 0);
+ if ((err = ic_dev_ioctl(SIOCSIFADDR, &ir)) < 0) {
+ printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err);
+ return -1;
+ }
+ set_sockaddr(sin, ic_netmask, 0);
+ if ((err = ic_dev_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
+ printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err);
+ return -1;
+ }
+ set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
+ if ((err = ic_dev_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
+ printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err);
+ return -1;
+ }
+ return 0;
+}
+
+static int __init ic_setup_routes(void)
+{
+ /* No need to setup device routes, only the default route... */
+
+ if (ic_gateway != INADDR_NONE) {
+ struct rtentry rm;
+ int err;
+
+ memset(&rm, 0, sizeof(rm));
+ if ((ic_gateway ^ ic_myaddr) & ic_netmask) {
+ printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n");
+ return -1;
+ }
+ set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0);
+ set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0);
+ set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);
+ rm.rt_flags = RTF_UP | RTF_GATEWAY;
+ if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) {
+ printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Fill in default values for all missing parameters.
+ */
+
+static int __init ic_defaults(void)
+{
+ /*
+ * At this point we have no userspace running so need not
+ * claim locks on system_utsname
+ */
+
+ if (!ic_host_name_set)
+ sprintf(system_utsname.nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr));
+
+ if (root_server_addr == INADDR_NONE)
+ root_server_addr = ic_servaddr;
+
+ if (ic_netmask == INADDR_NONE) {
+ if (IN_CLASSA(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSA_NET);
+ else if (IN_CLASSB(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSB_NET);
+ else if (IN_CLASSC(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSC_NET);
+ else {
+ printk(KERN_ERR "IP-Config: Unable to guess netmask for address %u.%u.%u.%u\n",
+ NIPQUAD(ic_myaddr));
+ return -1;
+ }
+ printk("IP-Config: Guessing netmask %u.%u.%u.%u\n", NIPQUAD(ic_netmask));
+ }
+
+ return 0;
+}
+
+/*
+ * RARP support.
+ */
+
+#ifdef IPCONFIG_RARP
+
+static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
+
+static struct packet_type rarp_packet_type __initdata = {
+ .type = __constant_htons(ETH_P_RARP),
+ .func = ic_rarp_recv,
+};
+
+static inline void ic_rarp_init(void)
+{
+ dev_add_pack(&rarp_packet_type);
+}
+
+static inline void ic_rarp_cleanup(void)
+{
+ dev_remove_pack(&rarp_packet_type);
+}
+
+/*
+ * Process received RARP packet.
+ */
+static int __init
+ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+{
+ struct arphdr *rarp;
+ unsigned char *rarp_ptr;
+ unsigned long sip, tip;
+ unsigned char *sha, *tha; /* s for "source", t for "target" */
+ struct ic_device *d;
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ return NET_RX_DROP;
+
+ if (!pskb_may_pull(skb, sizeof(struct arphdr)))
+ goto drop;
+
+ /* Basic sanity checks can be done without the lock. */
+ rarp = (struct arphdr *)skb->h.raw;
+
+ /* If this test doesn't pass, it's not IP, or we should
+ * ignore it anyway.
+ */
+ if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd))
+ goto drop;
+
+ /* If it's not a RARP reply, delete it. */
+ if (rarp->ar_op != htons(ARPOP_RREPLY))
+ goto drop;
+
+ /* If it's not Ethernet, delete it. */
+ if (rarp->ar_pro != htons(ETH_P_IP))
+ goto drop;
+
+ if (!pskb_may_pull(skb,
+ sizeof(struct arphdr) +
+ (2 * dev->addr_len) +
+ (2 * 4)))
+ goto drop;
+
+ /* OK, it is all there and looks valid, process... */
+ rarp = (struct arphdr *)skb->h.raw;
+ rarp_ptr = (unsigned char *) (rarp + 1);
+
+ /* One reply at a time, please. */
+ spin_lock(&ic_recv_lock);
+
+ /* If we already have a reply, just drop the packet */
+ if (ic_got_reply)
+ goto drop_unlock;
+
+ /* Find the ic_device that the packet arrived on */
+ d = ic_first_dev;
+ while (d && d->dev != dev)
+ d = d->next;
+ if (!d)
+ goto drop_unlock; /* should never happen */
+
+ /* Extract variable-width fields */
+ sha = rarp_ptr;